Please start any new threads on our new site at https://forums.sqlteam.com. We've got lots of great SQL Server experts to answer whatever question you can come up with.

 All Forums
 SQL Server 2012 Forums
 Transact-SQL (2012)
 reduce unnecessary records

Author  Topic 

sql_server_dba
Posting Yak Master

167 Posts

Posted - 2013-08-21 : 20:22:22
I have a history table which has the records that are not really necessary. Can anyone please help me how to reduce these records?

ItemNumber Start Date ItemValue End Date
111 12/15/2010 9:26 2 8/17/2011 14:06
111 8/17/2011 14:06 2 8/27/2011 3:36
111 8/27/2011 3:36 2 12/31/2099 0:00
222 12/15/2010 9:26 2 4/13/2011 18:36
222 4/13/2011 18:36 2 8/17/2011 14:06
222 8/17/2011 14:06 3 10/1/2011 19:40
222 10/1/2011 19:40 2 12/31/2099 0:00
333 12/15/2010 9:26 4 7/3/2011 1:37
333 7/2/2011 20:37 4 7/3/2011 1:37
333 7/3/2011 1:37 4 2/19/2012 1:36
333 2/19/2012 1:36 5 12/31/2099 0:00

Required output is below....
ItemNumber Start Date ItemValue End Date
111 12/15/2010 9:26 2 12/31/2099 0:00
222 12/15/2010 9:26 2 8/17/2011 14:06
222 8/17/2011 14:06 3 10/1/2011 19:40
222 10/1/2011 19:40 2 12/31/2099 0:00
333 12/15/2010 9:26 4 2/19/2012 1:36
333 2/19/2012 1:36 5 12/31/2099 0:00

bandi
Master Smack Fu Yak Hacker

2242 Posts

Posted - 2013-08-22 : 00:38:01
On which logic those records should be reduced?
Explain the output with logic....

--
Chandu
Go to Top of Page

sql_server_dba
Posting Yak Master

167 Posts

Posted - 2013-08-22 : 01:14:03
Its the itemNumber and itemValue. For every change in ItemValue for an ItemNumber, a new record should be inserted. But in the current table,even when there was no change in the ItemValue, there is a new record. Hope this helps.
Go to Top of Page

bandi
Master Smack Fu Yak Hacker

2242 Posts

Posted - 2013-08-22 : 02:14:59
Use case statement with LEAD() function
SELECT * 
FROM (SELECT *,CASE WHEN ItemValue = LEAD(ItemValue) OVER(PARTITION BY ItemNumber ORDER BY StartDate) THEN 0 ELSE 1 ENS Flag
FROM TableNAme)T
WHERE T.Flag = 1

NOTE: LEAD() OVER() is available in MSSQL 2012
Refer this link for LAG() & LEAD() functions
http://www.itdeveloperzone.com/2012/04/lead-and-lag-functions-in-sql-server.html
--
Chandu
Go to Top of Page

lazerath
Constraint Violating Yak Guru

343 Posts

Posted - 2013-08-22 : 10:27:21
Bandi, I don't believe your solution solves the problem. sql_server_dba: you also want to collapse the date periods?
Go to Top of Page

lazerath
Constraint Violating Yak Guru

343 Posts

Posted - 2013-08-22 : 11:11:15
[code]

DECLARE @StartingData TABLE (ItemNumber INT,StartDate DATETIME,ItemValue MONEY,EndDate DATETIME);
DECLARE @SimpleLEADResult TABLE (ItemNumber INT,StartDate DATETIME,ItemValue MONEY,EndDate DATETIME);
DECLARE @NewResult TABLE (ItemNumber INT,StartDate DATETIME,ItemValue MONEY,EndDate DATETIME);
DECLARE @DesiredResult TABLE (ItemNumber INT,StartDate DATETIME,ItemValue MONEY,EndDate DATETIME);

INSERT @StartingData
VALUES
(111,'12/15/2010 9:26',2,'8/17/2011 14:06'),
(111,'8/17/2011 14:06',2,'8/27/2011 3:36'),
(111,'8/27/2011 3:36',2,'12/31/2099 0:00'),
(222,'12/15/2010 9:26',2,'4/13/2011 18:36'),
(222,'4/13/2011 18:36',2,'8/17/2011 14:06'),
(222,'8/17/2011 14:06',3,'10/1/2011 19:40'),
(222,'10/1/2011 19:40',2,'12/31/2099 0:00'),
(333,'12/15/2010 9:26',4,'7/3/2011 1:37'),
(333,'7/2/2011 20:37',4,'7/3/2011 1:37'),
(333,'7/3/2011 1:37',4,'2/19/2012 1:36'),
(333,'2/19/2012 1:36',5,'12/31/2099 0:00');

INSERT @DesiredResult
VALUES
(111,'12/15/2010 9:26',2,'12/31/2099 0:00'),
(222,'12/15/2010 9:26',2,'8/17/2011 14:06'),
(222,'8/17/2011 14:06',3,'10/1/2011 19:40'),
(222,'10/1/2011 19:40',2,'12/31/2099 0:00'),
(333,'12/15/2010 9:26',4,'2/19/2012 1:36'),
(333,'2/19/2012 1:36',5,'12/31/2099 0:00');

INSERT @SimpleLEADResult
SELECT ItemNumber,StartDate,ItemValue,EndDate
FROM (SELECT *, CASE WHEN ItemValue = LEAD(ItemValue) OVER(PARTITION BY ItemNumber ORDER BY StartDate) THEN 0 ELSE 1 END AS Flag
FROM @StartingData) T
WHERE T.Flag = 1;

/*
ItemNumber StartDate ItemValue EndDate Flag
111 2011-08-27 03:36:00.000 2.00 2099-12-31 00:00:00.000 1
222 2011-04-13 18:36:00.000 2.00 2011-08-17 14:06:00.000 1
222 2011-08-17 14:06:00.000 3.00 2011-10-01 19:40:00.000 1
222 2011-10-01 19:40:00.000 2.00 2099-12-31 00:00:00.000 1
333 2011-07-03 01:37:00.000 4.00 2012-02-19 01:36:00.000 1
333 2012-02-19 01:36:00.000 5.00 2099-12-31 00:00:00.000 1
*/

SELECT * FROM @DesiredResult EXCEPT SELECT * FROM @SimpleLEADResult;

/* -- Rows from @DesiredResult that aren't the same as @SimpleLEADResult;
ItemNumber StartDate ItemValue EndDate
111 2010-12-15 09:26:00.000 2.00 2099-12-31 00:00:00.000
222 2010-12-15 09:26:00.000 2.00 2011-08-17 14:06:00.000
333 2010-12-15 09:26:00.000 4.00 2012-02-19 01:36:00.000
*/

WITH cte1
AS
(
SELECT *,
ROW_NUMBER() OVER(PARTITION BY ItemNumber ORDER BY StartDate) AS SEQUENCE,
CASE WHEN ItemValue = LEAD(ItemValue) OVER(PARTITION BY ItemNumber ORDER BY StartDate) THEN 0 ELSE 1 END AS ChangeFlag
FROM @StartingData
)
, cte2
AS
(
SELECT *,
MIN(CASE ChangeFlag WHEN 1 THEN SEQUENCE END) OVER(PARTITION BY ItemNumber ORDER BY SEQUENCE ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) SeqGroup
FROM cte1
)
INSERT @NewResult
SELECT c.ItemNumber, MIN(c.StartDate) AS StartDate, c.ItemValue, MAX(c.EndDate) AS EndDate
FROM cte2 AS c
GROUP BY c.ItemNumber, c.ItemValue, c.SeqGroup
ORDER BY c.ItemNumber, MIN(c.StartDate)

/*
ItemNumber StartDate ItemValue EndDate
111 2010-12-15 09:26:00.000 2.00 2099-12-31 00:00:00.000
222 2010-12-15 09:26:00.000 2.00 2011-08-17 14:06:00.000
222 2011-08-17 14:06:00.000 3.00 2011-10-01 19:40:00.000
222 2011-10-01 19:40:00.000 2.00 2099-12-31 00:00:00.000
333 2010-12-15 09:26:00.000 4.00 2012-02-19 01:36:00.000
333 2012-02-19 01:36:00.000 5.00 2099-12-31 00:00:00.000
*/

SELECT * FROM @DesiredResult EXCEPT SELECT * FROM @NewResult

/* -- Rows from @DesiredResult that aren't the same as @NewResult;
(0 row(s) affected)
*/
[/code]

* Edit: I found a bug I introduced by putting aliases on cte2. Fixed.
Go to Top of Page

lazerath
Constraint Violating Yak Guru

343 Posts

Posted - 2013-08-22 : 11:20:54
Also, while this solves your problem from the logical sense, it does not solve it from the logistical sense. How big is this history table? If you are working on systems with millions or billions of rows, you cannot simply run this query against it. You will likely need to batch this as well as a strategy to handle changes while the batch process is running. If your table is small and you can handle a brief outage, you might be able to get away with locking the table and doing a MERGE.
Go to Top of Page

sql_server_dba
Posting Yak Master

167 Posts

Posted - 2013-08-22 : 11:53:54
Lazerath...seems like Bandi solution solves the problem(Not sure if i missing anything there). Yeah, i want to collapse the periods too but only if the ItemValue changes. BTW..even you solution works with CTE's.

For example...for ItemNumber 111, item value didn't changed so i wanted to collapse all 3 records into one. But for ItemNumber 222...itemValue changed from 2 to 3 and then from 3 to 2. So i need 3 records for ItemNumber 222.
Go to Top of Page

lazerath
Constraint Violating Yak Guru

343 Posts

Posted - 2013-08-22 : 12:04:58
Take a look at the code I posted. You can run the whole thing and it walks you through the differences between Bandi's solution and the end state you were looking for. The major difference is that Bandi's solution did not collapse the records, all it did was identify the last item value record before a change. This is key to the final solution, but is not the solution in and of itself.

For instance, for item 111, you indicated this should be the result:
111 12/15/2010 9:26 2 12/31/2099 0:00

However, in Bandi's solution, it provides this:
111 8/27/2011 3:36 2 12/31/2099 0:00

Notice the Start Date. Bandi's solution does not collapse the 3 records into 1, it simply returns the last record. I had to add code to properly group sequential history records that did not change. This was accomplished with this code:

WITH cte1
AS
(
SELECT *,
ROW_NUMBER() OVER(PARTITION BY ItemNumber ORDER BY StartDate) AS SEQUENCE,
CASE WHEN ItemValue = LEAD(ItemValue) OVER(PARTITION BY ItemNumber ORDER BY StartDate) THEN 0 ELSE 1 END AS ChangeFlag
FROM @StartingData
)
, cte2
AS
(
SELECT *,
MIN(CASE ChangeFlag WHEN 1 THEN SEQUENCE END) OVER(PARTITION BY ItemNumber ORDER BY SEQUENCE ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) SeqGroup
FROM cte1
)
SELECT c.ItemNumber, MIN(c.StartDate) AS StartDate, c.ItemValue, MAX(c.EndDate) AS EndDate
FROM cte2 AS c
GROUP BY c.ItemNumber, c.ItemValue, c.SeqGroup
ORDER BY c.ItemNumber, MIN(c.StartDate)


cte1 gets the change boundaries as Bandi provided but adds in a Sequence. cte2 returns the MIN sequence with a change that is greater than or equal to the current record. This allows us to group sequential entries that do not change and produce the result you requested.
Go to Top of Page

sql_server_dba
Posting Yak Master

167 Posts

Posted - 2013-08-22 : 12:08:29
Good catch....Thanks Lazerath.
Go to Top of Page

lazerath
Constraint Violating Yak Guru

343 Posts

Posted - 2013-08-22 : 12:23:10
No problem, glad to help.

As it just so happens, I found an optimization to eliminate the GROUP BY and replace the Sequence with a different boundary identifier:


WITH cte1
AS
(
SELECT c.ItemNumber,c.StartDate,c.ItemValue,c.EndDate,
CASE WHEN c.ItemValue = LAG(c.ItemValue) OVER(PARTITION BY c.ItemNumber ORDER BY c.StartDate) THEN 0 ELSE 1 END AS isChangeBoundaryBeg,
CASE WHEN c.ItemValue = LEAD(c.ItemValue) OVER(PARTITION BY c.ItemNumber ORDER BY c.StartDate) THEN 0 ELSE 1 END AS isChangeBoundaryEnd
FROM @StartingData AS c
)
/* --SELECT * FROM cte1
ItemNumber StartDate ItemValue EndDate isChangeBoundaryBeg isChangeBoundaryEnd
111 2010-12-15 09:26:00.000 2.00 2011-08-17 14:06:00.000 1 0
111 2011-08-17 14:06:00.000 2.00 2011-08-27 03:36:00.000 0 0
111 2011-08-27 03:36:00.000 2.00 2099-12-31 00:00:00.000 0 1
222 2010-12-15 09:26:00.000 2.00 2011-04-13 18:36:00.000 1 0
222 2011-04-13 18:36:00.000 2.00 2011-08-17 14:06:00.000 0 1
222 2011-08-17 14:06:00.000 3.00 2011-10-01 19:40:00.000 1 1
222 2011-10-01 19:40:00.000 2.00 2099-12-31 00:00:00.000 1 1
333 2010-12-15 09:26:00.000 4.00 2011-07-03 01:37:00.000 1 0
333 2011-07-02 20:37:00.000 4.00 2011-07-03 01:37:00.000 0 0
333 2011-07-03 01:37:00.000 4.00 2012-02-19 01:36:00.000 0 1
333 2012-02-19 01:36:00.000 5.00 2099-12-31 00:00:00.000 1 1
*/
, cte2
AS
(
SELECT c.ItemNumber,
MAX(CASE c.isChangeBoundaryBeg WHEN 1 THEN c.StartDate END) OVER(PARTITION BY c.ItemNumber ORDER BY c.StartDate ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS StartDate,
c.ItemValue,
c.EndDate,
c.isChangeBoundaryEnd
FROM cte1 as c
)
/* --SELECT * FROM cte2
ItemNumber StartDate ItemValue EndDate isChangeBoundaryEnd
111 2010-12-15 09:26:00.000 2.00 2011-08-17 14:06:00.000 0
111 2010-12-15 09:26:00.000 2.00 2011-08-27 03:36:00.000 0
111 2010-12-15 09:26:00.000 2.00 2099-12-31 00:00:00.000 1
222 2010-12-15 09:26:00.000 2.00 2011-04-13 18:36:00.000 0
222 2010-12-15 09:26:00.000 2.00 2011-08-17 14:06:00.000 1
222 2011-08-17 14:06:00.000 3.00 2011-10-01 19:40:00.000 1
222 2011-10-01 19:40:00.000 2.00 2099-12-31 00:00:00.000 1
333 2010-12-15 09:26:00.000 4.00 2011-07-03 01:37:00.000 0
333 2010-12-15 09:26:00.000 4.00 2011-07-03 01:37:00.000 0
333 2010-12-15 09:26:00.000 4.00 2012-02-19 01:36:00.000 1
333 2012-02-19 01:36:00.000 5.00 2099-12-31 00:00:00.000 1
*/
INSERT @NewResult
SELECT c.ItemNumber,c.StartDate,c.ItemValue,c.EndDate
FROM cte2 AS c
WHERE c.isChangeBoundaryEnd = 1
ORDER BY c.ItemNumber, c.StartDate

/*
ItemNumber StartDate ItemValue EndDate
111 2010-12-15 09:26:00.000 2.00 2099-12-31 00:00:00.000
222 2010-12-15 09:26:00.000 2.00 2011-08-17 14:06:00.000
222 2011-08-17 14:06:00.000 3.00 2011-10-01 19:40:00.000
222 2011-10-01 19:40:00.000 2.00 2099-12-31 00:00:00.000
333 2010-12-15 09:26:00.000 4.00 2012-02-19 01:36:00.000
333 2012-02-19 01:36:00.000 5.00 2099-12-31 00:00:00.000
*/


* Edit: Added data to show internal CTE results
Go to Top of Page

sql_server_dba
Posting Yak Master

167 Posts

Posted - 2013-08-22 : 17:10:00
Awesome...this is great!!! Thanks.
Go to Top of Page
   

- Advertisement -