Collapse date ranges - SQL Server Forums

Please start any new threads on our new site at https://forums.sqlteam.com. We've got lots of great SQL Server experts to answer whatever question you can come up with.

All Forums

General SQL Server Forums

Script Library

Collapse date ranges

Author

Topic

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2007-08-25 : 17:57:30

Here is an algorithm that works for both SQL Server 2005 and SQL Server 2008 with full/highest compatibility mode for both.

-- prepare sample data
create table #test(
   a int,
   d1 datetime,
   d2 datetime)

insert into #test
select 1, '20070101', '20070301'
union all
select 1, '20070501', '20070901'
union all
select 2, '20070101', '20070601'
union all
select 2, '20070301', '20070801'

-- Stage the data
;WITH yak1 (a, d1, d2, recid)
as (
	select	a,
		d1,
		d2,
		row_number() over (partition by a order by d1) as recid
	from	#test
), yak2 (a, start, stop, recid, grp)
as (
	select	a,
		d1,
		d2,
		recid,
		0
	from	yak1
	where	recid = 1

	union all

	select		y1.a,
			y1.d1,
			CASE
				WHEN y1.d2 < y2.stop THEN y2.stop
				ELSE y1.d2
			END,
			y1.recid,
			case
				when y2.stop < y1.d1 then y2.grp + 1
				else y2.grp
			end
	from		yak2 as y2
	inner join	yak1 as y1 on y1.a = y2.a
	where		y1.recid = y2.recid + 1
)

-- Show the expected output
select		a,
		min(start),
		max(stop)
from		yak2
group by	a,
		grp
order by	a,
		2
OPTION		(MAXRECURSION 0)

-- Clean up
drop table #test

E 12°55'05.25"
N 56°04'39.16"

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2007-08-26 : 12:18:04

Get the "missing" date ranges?

-- Prepare sample data
DECLARE	@Sample TABLE (PlantID INT, FromDate DATETIME, ToDate DATETIME)

INSERT	@Sample
SELECT	1, '20050830', '20060918' UNION ALL
SELECT	1, '20060918', '20061201' UNION ALL
SELECT	2, '20070101', '20070228' UNION ALL
SELECT	2, '20070301', '20070331' UNION ALL
SELECT	1, '20070601', '20071231' UNION ALL
SELECT	1, '20070201', '20070515'

select * from @sample

-- Stage the data
;WITH Yak1 (PlantID, FromDate, ToDate, RecID)
AS (
	SELECT	PlantID,
		FromDate,
		ToDate,
		ROW_NUMBER() OVER (PARTITION BY PlantID ORDER BY FromDate) AS RecID
	FROM	@Sample
), Yak2 (PlantID, FromDate, ToDate, RecID, Grp)
AS (
	SELECT	PlantID,
		FromDate,
		ToDate,
		RecID,
		0
	FROM	Yak1
	WHERE	RecID = 1

	UNION ALL

	SELECT		y1.PlantID,
			y1.FromDate,
			CASE
				WHEN y1.ToDate < y2.ToDate THEN y2.ToDate
				ELSE y1.ToDate
			END,
			y1.RecID,
			CASE
				WHEN y2.ToDate < y1.FromDate THEN y2.Grp + 1
				ELSE y2.Grp
			END
	FROM		Yak2 AS y2
	INNER JOIN	Yak1 AS y1 ON y1.PlantID = y2.PlantID
	WHERE		y1.RecID = y2.RecID + 1
), Yak3 (PlantID, FromDate, ToDate, RowID)
AS (
	SELECT		PlantID,
			MIN(FromDate),
			MAX(ToDate),
			ROW_NUMBER() OVER (PARTITION BY PlantID ORDER BY MIN(FromDate))
	FROM		Yak2
	GROUP BY	PlantID,
			Grp
), Yak4 (PlantID, FromDate, ToDate)
AS (
	SELECT		t1.PlantID,
			t1.ToDate,
			t2.FromDate
	FROM		Yak3 AS t1
	INNER JOIN	Yak3 AS t2 ON t2.PlantID = t1.PlantID AND t2.RowID - 1 = t1.RowID
)

-- Show the expected output
select * from yak4

E 12°55'05.25"
N 56°04'39.16"

jsmith8858
Dr. Cross Join

7423 Posts

Posted - 2007-08-30 : 15:03:38

Here's one for all versions of SQL, and it seems to be much more efficient than the SQL 2005/2008 versions. (Using Peso's sample DDL)


select StartDates.a, StartDates.date as d1, min(EndDates.Date) as d2
from
(	
	select t.a, t.d1 as Date
	from #test t
	left outer join #test t2 on t.a=t2.a and t.d1 > t2.d1 and t.d1 <= t2.d2
	where t2.a is null
) StartDates
inner join
(
	select t.a, t.d2 as Date
    from #test t
	left outer join #test t2 on t.a=t2.a and t.d2 >= t2.d1 and t.d2 < t2.d2
	where t2.a is null
) EndDates 
on
  StartDates.a = EndDates.a and StartDates.Date <= EndDates.Date
group by
	StartDates.a, StartDates.date

Hopefully it is somewhat easy to follow. The key is to eliminate dates that are contained within date ranges since they should not figure into a start or end of a range. That's really about it, and that's what the two derived tables are doing.

- Jeff
http://weblogs.sqlteam.com/JeffS

jsmith8858
Dr. Cross Join

7423 Posts

Posted - 2007-08-30 : 15:16:26

Almost the same deal, this time for the "missing" date ranges, runs on all versions of SQL and seems to be quite more efficient that the sql 2005/2008 solution given:


select fromDates.plantID, fromDates.Date as FromDate, min(ToDates.Date) as ToDate
from
(
	select s1.plantID, s1.ToDate as Date
	from @sample s1
	left outer join @sample s2 on s1.plantID = s2.plantID and 
                                      s1.ToDate >= s2.FromDate and 
                                      s1.ToDate < s2.ToDate
	where s2.plantID is null
) fromDates
inner join
(
	select s1.plantID, s1.FromDate as Date
	from @sample s1
	left outer join @sample s2 on s1.plantID = s2.plantID and 
                                      s1.FromDate > s2.FromDate and 
                                      s1.FromDate <= s2.ToDate 
	where s2.plantID is null
) ToDates
on
  fromDates.PlantID = ToDates.PlantID and FromDates.Date < ToDates.Date
group by 
  fromDates.plantID, fromDates.Date

- Jeff
http://weblogs.sqlteam.com/JeffS

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2007-08-30 : 15:29:23

This testdata fails

INSERT	@Sample
SELECT	1, '20070101', '20070501' UNION ALL
SELECT	1, '20070501', '20070901'

And this

INSERT	@Sample
SELECT	1, '20070101', '20070501' UNION ALL
SELECT	1, '20070701', '20070901' UNION ALL
SELECT	1, '20070601', '20070801' UNION ALL
SELECT	1, '20070201', '20070301' UNION ALL
SELECT	1, '20070401', '20070601'
/*
1 2 3 4 5 6 7 8 9
|-------|
  |-|
      |---|
          |---|
            |---|
*/

E 12°55'05.25"
N 56°04'39.16"

jsmith8858
Dr. Cross Join

7423 Posts

Posted - 2007-08-30 : 15:34:45

Thanks, Peso!

Is that sample data for the first one or the second one?

For the first one -- fixed easily as noted in red.

As for the second one -- I am confused. based on your sample data, what should that return? It returns the same thing as yours -- no results, since there are no "missing date ranges".

Could you do me a favor and be a little more specific? Thanks!!

- Jeff

http://weblogs.sqlteam.com/JeffS

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2007-08-30 : 16:11:25

Still reverse engineering your "collapse" algorithm.

Now I understand it

Try this sample data, Jeff

INSERT	#Test
SELECT	1, '20070401', '20070701' UNION ALL
SELECT	1, '20070501', '20070601' UNION ALL
SELECT	1, '20070101', '20070301' UNION ALL
SELECT	1, '20070201', '20070501' UNION ALL
SELECT	2, '20070101', '20070301' UNION ALL
SELECT	2, '20070501', '20070901' UNION ALL
SELECT	2, '20070101', '20070601' UNION ALL
SELECT	2, '20070301', '20070801'
/*
1 2 3 4 5 6 7 8 9
*****************
      |-----|
        |-|
|---|          
  |-----|
*****************
|---|
        |-------|
|---------|
    |---------|
*****************
*/

E 12°55'05.25"
N 56°04'39.16"

jsmith8858
Dr. Cross Join

7423 Posts

Posted - 2007-08-30 : 16:22:42

Same result as yours, one row returned ...

The algorithm is very simple:

Let X = all start dates NOT contained in a date range
Let Y = all end dates NOT contained in a date range

So, X = all of our starting points.

For each X, find the Y with the lowest date greater than X's date that has the same ID

That's it!

I know that these days I mostly yell at posters and say "presentation layer!" and things like that, but I actually am not too shabby at writing T-SQL ... I just don't have as much time or patience to write out answers lately considering that we are never given DDL or sample data ... plus we have you around!

- Jeff
http://weblogs.sqlteam.com/JeffS

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2007-08-30 : 16:31:50

Must been a copy and paste error. Now I get the correct result.
Well done!

I like having these discussion forth and back and trying to find flaws in each other's algorithms.

E 12°55'05.25"
N 56°04'39.16"

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2007-08-30 : 16:47:47

Run some tests in SQL Profiler for test data posted last.

	CPU	DURATION	READS	WRITES
Jeff	0	1		210	0
Peso	0	1		162	0

E 12°55'05.25"
N 56°04'39.16"

jsmith8858
Dr. Cross Join

7423 Posts

Posted - 2007-08-30 : 16:57:30

quote:

I like having these discussion forth and back and trying to find flaws in each other's algorithms.

Agreed! That's pretty much why I enjoy coming to this site.

It might be interesting to try the different methods with a decent amount of indexed data to see how things look.

- Jeff
http://weblogs.sqlteam.com/JeffS

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2007-08-30 : 17:18:25

Tests done!

CREATE TABLE	#Test
		(
			a INT,
			d1 DATETIME,
			d2 DATETIME
		)
GO
INSERT		#Test
		(
			a,
			d1
		)
SELECT TOP 1000	ABS(CHECKSUM(NEWID())) % 2,
		25000 + ABS(CHECKSUM(NEWID())) % 25000
FROM		syscolumns AS c1
CROSS JOIN	syscolumns AS c2
GO
UPDATE	#Test
SET	d2 = DATEADD(DAY, ABS(CHECKSUM(NEWID())) % 30, d1)
GO
CREATE CLUSTERED INDEX ix_test on #Test (a, d1, d2)
GO

Average result from profiler for 20 runs

	CPU	DUR	READS	WRITES
Jeff	19406	19998	551549	0
Peso	  844	  916	 19605	0

E 12°55'05.25"
N 56°04'39.16"

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2007-08-30 : 17:46:37

New tests!

CREATE TABLE	#Test
		(
			a INT,
			d1 DATETIME,
			d2 DATETIME
		)
GO
INSERT		#Test
		(
			a,
			d1
		)
SELECT TOP 2000	ABS(CHECKSUM(NEWID())) % 5,
		25000 + ABS(CHECKSUM(NEWID())) % 25000
FROM		syscolumns AS c1
CROSS JOIN	syscolumns AS c2
GO
UPDATE	#Test
SET	d2 = DATEADD(DAY, ABS(CHECKSUM(NEWID())) % 30, d1)
GO
CREATE CLUSTERED INDEX ix_test ON #Test (a, d1, d2)
GO

Average result from profiler for 10 runs

	CPU	DUR	READS	WRITES
Jeff	32219	33920	905521	1	
Peso	 2266	 2368	 36630	0

E 12°55'05.25"
N 56°04'39.16"

jsmith8858
Dr. Cross Join

7423 Posts

Posted - 2007-08-30 : 19:01:27

Wow! That's a huge difference! So much for the initial execution plan. I'll have to investigate this when I get a chance.

- Jeff
http://weblogs.sqlteam.com/JeffS

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2007-08-31 : 01:28:32

To both our defences, the tests show that the time and reads scales linear, not exponential


-- 1k	CPU	DURATION	READS		WRITES
Jeff	    0	     1		    210		0
Peso	    0	     1		    162		0
Peso 2	    0	    77		     31		0

-- 2k	CPU	DURATION	READS		WRITES
Jeff	19,406	19,998		551,549		0
Peso	   844	   916		 19,605		0
Peso 2	    15	    72		     56		0

-- 3k	CPU	DURATION	READS		WRITES
Jeff	39,359	41,557		956,584		3
Peso	 5,172	 5,381		 62,984		0
Peso 2	    16	    81		     91		0

-- 4k	CPU	DURATION	READS		WRITES
Jeff	52,235	55,708		1,237,692	0
Peso	 9,172	 9,492		   99,976	0
Peso 2	    31	    76		      106	0

-- 5k	CPU	DURATION	READS		WRITES
Jeff	95,438	100,383		2,202,898	0
Peso	13,953	 14,985		  140,024	0
Peso 2	    31	     78		      166	0

-- 6k	CPU	DURATION	READS		WRITES
Jeff	107,328	112,983		2,420,560	0
Peso	 20,968	 23,352		  192,023	1
Peso 2	     31	    118		      233	0

-- 10k	CPU	DURATION	READS		WRITES
Peso 2	     48	     97		      290	0

-- 100k	CPU	DURATION	READS		WRITES
Peso 2	    140	    273		    3,002	0

--1000k	CPU	DURATION	READS		WRITES
Peso 2	  1,593	  1,598		   15,322	0

E 12°55'05.25"
N 56°04'39.16"

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2008-05-13 : 10:17:37

Also see
http://weblogs.sqlteam.com/peterl/archive/2008/05/13/Lightning-fast-collapsed-date-ranges-and-missing-date-ranges.aspx

E 12°55'05.25"
N 56°04'39.16"

Michael Valentine Jones
Yak DBA Kernel (pronounced Colonel)

7020 Posts

Posted - 2008-05-13 : 11:55:41

Hate to be dense (stupid) here, but can you give a little more explanation about the problem you are trying to solve?

Before I read through all the code, it would be helpful to know what it is supposed to do.

CODO ERGO SUM

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2008-05-13 : 12:53:35

With collapsed date ranges, the purpose is the merge overlapping date ranges as a few possible.

1 2 3 4 5 6 7 8 9
*****************
      |-----|
        |-|
|---|
  |-----|
*****************

See for example these four date ranges. They can be merged as one because they overlap.
Result should be one date range 1-7.

In this example, the four ranges can only be merged into two since there is a gap.

1 2 3 4 5 6 7 8 9
*****************
        |----|
           |--|
|---|
  |---|
*****************

Result is 1-4 and 5-8.
This example above also produces a gap between 4-5.

This is what the blog post is about, how to efficiently and fast get the wanted results.
Either the collapsed (merged) date ranges, or the missing (gaps) date ranges.

E 12°55'05.25"
N 56°04'39.16"

lmontgomery74
Starting Member

1 Post

Posted - 2010-12-22 : 18:18:54

Great post guys... Thanks!

SwePeso
Patron Saint of Lost Yaks

30421 Posts

Posted - 2011-04-07 : 11:32:32

Here is an algorithm that will work not only with days only, but also with millisecond precision and several years in the intervals.

;WITH cteSource(UserName, theTime, theGrp)
AS (
	SELECT		u.UserName,
			u.theTime,
			(DENSE_RANK() OVER (PARTITION BY u.UserName ORDER BY u.theTime) - 1) / 2 AS theGrp	-- Create artificial groups depending
														-- on the times. Omit duplicates.
	FROM		(
				-- Get all usernames and first start time and last endtime
				SELECT		a AS UserName,
						MIN(d1) AS minStartTime,
						MAX(d2) AS maxEndTIme
				FROM		#Test
				GROUP BY	a
			) AS usr
			-- This only get the intermediate gaps
	OUTER APPLY	(
				SELECT		s.StartTime,
						e.EndTime
				FROM		(
							-- Get all starttimes sorted
							SELECT	s.d1 AS StartTime,
								ROW_NUMBER() OVER (ORDER BY s.d1) AS SeqID
							FROM	#Test AS s
							WHERE	s.a = usr.UserName
						) AS s
				INNER JOIN	(
							-- Get all endtimes sorted
							SELECT	s.d2 AS EndTime,
								ROW_NUMBER() OVER (ORDER BY s.d2) + 1 AS SeqID
							FROM	#Test AS s
							WHERE	s.a = usr.UserName
						) AS e ON e.SeqID = s.SeqID	-- Match previous end time time against this starttime
				WHERE		e.EndTime < s.StartTime		-- If EndTime is less than starttime, this is a gap
			) AS bnd
	UNPIVOT		(
				-- Since the bnd table only get the intermediate gaps,
				-- add the first start time and last end time to the intervals
				theTime
				FOR theCol IN (usr.minStartTime, usr.maxEndTime, bnd.StartTime, bnd.EndTime)
			) AS u
)
-- For each artifical group, display the earliest and latest time
SELECT		UserName,
		MIN(theTime) AS StartTime,
		MAX(theTime) AS EndTime
FROM		cteSource
GROUP BY	UserName,
		theGrp;

N 56°04'39.26"
E 12°55'05.63"

Subscribe to SQLTeam.com

SQLTeam.com Articles via RSS

SQLTeam.com Weblog via RSS

- Advertisement -

Resources