---------------------------------------------------------------------- -- SQL Server MVP Deep Dives (Manning, 2009) -- Chapter 05 - Gaps and Islands -- © Itzik Ben-Gan ---------------------------------------------------------------------- SET NOCOUNT ON; USE tempdb; -- dbo.NumSeq (numeric sequence with unique values, interval: 1) IF OBJECT_ID('dbo.NumSeq', 'U') IS NOT NULL DROP TABLE dbo.NumSeq; CREATE TABLE dbo.NumSeq ( seqval INT NOT NULL CONSTRAINT PK_NumSeq PRIMARY KEY ); INSERT INTO dbo.NumSeq(seqval) VALUES(2); INSERT INTO dbo.NumSeq(seqval) VALUES(3); INSERT INTO dbo.NumSeq(seqval) VALUES(11); INSERT INTO dbo.NumSeq(seqval) VALUES(12); INSERT INTO dbo.NumSeq(seqval) VALUES(13); INSERT INTO dbo.NumSeq(seqval) VALUES(27); INSERT INTO dbo.NumSeq(seqval) VALUES(33); INSERT INTO dbo.NumSeq(seqval) VALUES(34); INSERT INTO dbo.NumSeq(seqval) VALUES(35); INSERT INTO dbo.NumSeq(seqval) VALUES(42); -- dbo.TempSeq (temporal sequence with unique values, interval: 4 hours) IF OBJECT_ID('dbo.TempSeq', 'U') IS NOT NULL DROP TABLE dbo.TempSeq; CREATE TABLE dbo.TempSeq ( seqval DATETIME NOT NULL CONSTRAINT PK_TempSeq PRIMARY KEY ); INSERT INTO dbo.TempSeq(seqval) VALUES('20090212 00:00'); INSERT INTO dbo.TempSeq(seqval) VALUES('20090212 04:00'); INSERT INTO dbo.TempSeq(seqval) VALUES('20090212 12:00'); INSERT INTO dbo.TempSeq(seqval) VALUES('20090212 16:00'); INSERT INTO dbo.TempSeq(seqval) VALUES('20090212 20:00'); INSERT INTO dbo.TempSeq(seqval) VALUES('20090213 08:00'); INSERT INTO dbo.TempSeq(seqval) VALUES('20090213 20:00'); INSERT INTO dbo.TempSeq(seqval) VALUES('20090214 00:00'); INSERT INTO dbo.TempSeq(seqval) VALUES('20090214 04:00'); INSERT INTO dbo.TempSeq(seqval) VALUES('20090214 12:00'); -- dbo.NumSeqDups (numeric sequence with duplicates, interval: 1) IF OBJECT_ID('dbo.NumSeqDups', 'U') IS NOT NULL DROP TABLE dbo.NumSeqDups; CREATE TABLE dbo.NumSeqDups ( seqval INT NOT NULL ); CREATE CLUSTERED INDEX idx_seqval ON dbo.NumSeqDups(seqval); INSERT INTO dbo.NumSeqDups(seqval) VALUES(2); INSERT INTO dbo.NumSeqDups(seqval) VALUES(2); INSERT INTO dbo.NumSeqDups(seqval) VALUES(2); INSERT INTO dbo.NumSeqDups(seqval) VALUES(3); INSERT INTO dbo.NumSeqDups(seqval) VALUES(11); INSERT INTO dbo.NumSeqDups(seqval) VALUES(12); INSERT INTO dbo.NumSeqDups(seqval) VALUES(12); INSERT INTO dbo.NumSeqDups(seqval) VALUES(13); INSERT INTO dbo.NumSeqDups(seqval) VALUES(27); INSERT INTO dbo.NumSeqDups(seqval) VALUES(27); INSERT INTO dbo.NumSeqDups(seqval) VALUES(27); INSERT INTO dbo.NumSeqDups(seqval) VALUES(27); INSERT INTO dbo.NumSeqDups(seqval) VALUES(33); INSERT INTO dbo.NumSeqDups(seqval) VALUES(34); INSERT INTO dbo.NumSeqDups(seqval) VALUES(34); INSERT INTO dbo.NumSeqDups(seqval) VALUES(35); INSERT INTO dbo.NumSeqDups(seqval) VALUES(35); INSERT INTO dbo.NumSeqDups(seqval) VALUES(35); INSERT INTO dbo.NumSeqDups(seqval) VALUES(42); INSERT INTO dbo.NumSeqDups(seqval) VALUES(42); -- dbo.BigNumSeq (big numeric sequence with unique values, interval: 1) IF OBJECT_ID('dbo.BigNumSeq', 'U') IS NOT NULL DROP TABLE dbo.BigNumSeq; CREATE TABLE dbo.BigNumSeq ( seqval INT NOT NULL CONSTRAINT PK_BigNumSeq PRIMARY KEY ); -- Populate table with values in the range 1 through to 10,000,000 -- with a gap every 1000 (total 10,000 gaps) WITH L0 AS(SELECT 1 AS c UNION ALL SELECT 1), L1 AS(SELECT 1 AS c FROM L0 AS A, L0 AS B), L2 AS(SELECT 1 AS c FROM L1 AS A, L1 AS B), L3 AS(SELECT 1 AS c FROM L2 AS A, L2 AS B), L4 AS(SELECT 1 AS c FROM L3 AS A, L3 AS B), L5 AS(SELECT 1 AS c FROM L4 AS A, L4 AS B), Nums AS(SELECT ROW_NUMBER() OVER(ORDER BY (SELECT 0)) AS n FROM L5) INSERT INTO dbo.BigNumSeq WITH(TABLOCK) (seqval) SELECT n FROM Nums WHERE n <= 10000000 AND n % 1000 <> 0; ---------------------------------------------------------------------- -- Gaps, Solution 1 using subqueries ---------------------------------------------------------------------- -- Numeric SELECT seqval + 1 AS start_range, -- minimum value that is greater than the current (SELECT MIN(B.seqval) FROM dbo.NumSeq AS B WHERE B.seqval > A.seqval) - 1 AS end_range FROM dbo.NumSeq AS A WHERE NOT EXISTS -- point before gap (SELECT * FROM dbo.NumSeq AS B WHERE B.seqval = A.seqval + 1) -- value smaller than maximum AND seqval < (SELECT MAX(seqval) FROM dbo.NumSeq); -- Temporal SELECT DATEADD(hour, 4, seqval) AS start_range, DATEADD(hour, -4, (SELECT MIN(B.seqval) FROM dbo.TempSeq AS B WHERE B.seqval > A.seqval)) AS end_range FROM dbo.TempSeq AS A WHERE NOT EXISTS (SELECT * FROM dbo.TempSeq AS B WHERE B.seqval = DATEADD(hour, 4, A.seqval)) AND seqval < (SELECT MAX(seqval) FROM dbo.TempSeq); -- Dups SELECT seqval + 1 AS start_range, (SELECT MIN(B.seqval) FROM dbo.NumSeqDups AS B WHERE B.seqval > A.seqval) - 1 AS end_range FROM (SELECT DISTINCT seqval FROM dbo.NumSeqDups) AS A WHERE NOT EXISTS (SELECT * FROM dbo.NumSeqDups AS B WHERE B.seqval = A.seqval + 1) AND seqval < (SELECT MAX(seqval) FROM dbo.NumSeqDups); SELECT DISTINCT seqval + 1 AS start_range, (SELECT MIN(B.seqval) FROM dbo.NumSeqDups AS B WHERE B.seqval > A.seqval) - 1 AS end_range FROM dbo.NumSeqDups AS A WHERE NOT EXISTS (SELECT * FROM dbo.NumSeqDups AS B WHERE B.seqval = A.seqval + 1) AND seqval < (SELECT MAX(seqval) FROM dbo.NumSeqDups); -- Big -- 8 seconds, 62,262 logical reads SELECT seqval + 1 AS start_range, (SELECT MIN(B.seqval) FROM dbo.BigNumSeq AS B WHERE B.seqval > A.seqval) - 1 AS end_range FROM dbo.BigNumSeq AS A WHERE NOT EXISTS (SELECT * FROM dbo.BigNumSeq AS B WHERE B.seqval = A.seqval + 1) AND seqval < (SELECT MAX(seqval) FROM dbo.BigNumSeq); ---------------------------------------------------------------------- -- Gaps, Solution 2 using subqueries ---------------------------------------------------------------------- -- Numeric SELECT cur + 1 AS start_range, nxt - 1 AS end_range FROM (SELECT seqval AS cur, (SELECT MIN(B.seqval) FROM dbo.NumSeq AS B WHERE B.seqval > A.seqval) AS nxt FROM dbo.NumSeq AS A) AS D WHERE nxt - cur > 1; -- Temporal SELECT DATEADD(hour, 4, cur) AS start_range, DATEADD(hour, -4, nxt) AS end_range FROM (SELECT seqval AS cur, (SELECT MIN(B.seqval) FROM dbo.TempSeq AS B WHERE B.seqval > A.seqval) AS nxt FROM dbo.TempSeq AS A) AS D WHERE DATEDIFF(hour, cur, nxt) > 4; -- Dups SELECT cur + 1 AS start_range, nxt - 1 AS end_range FROM (SELECT seqval AS cur, (SELECT MIN(B.seqval) FROM dbo.NumSeqDups AS B WHERE B.seqval > A.seqval) AS nxt FROM (SELECT DISTINCT seqval FROM dbo.NumSeqDups) AS A) AS D WHERE nxt - cur > 1; SELECT DISTINCT cur + 1 AS start_range, nxt - 1 AS end_range FROM (SELECT seqval AS cur, (SELECT MIN(B.seqval) FROM dbo.NumSeqDups AS B WHERE B.seqval > A.seqval) AS nxt FROM dbo.NumSeqDups AS A) AS D WHERE nxt - cur > 1; -- Big -- 48 seconds, 31,875,478 logical reads SELECT cur + 1 AS start_range, nxt - 1 AS end_range FROM (SELECT seqval AS cur, (SELECT MIN(B.seqval) FROM dbo.BigNumSeq AS B WHERE B.seqval > A.seqval) AS nxt FROM dbo.BigNumSeq AS A) AS D WHERE nxt - cur > 1; ---------------------------------------------------------------------- -- Gaps, Solution 3 using ranking functions ---------------------------------------------------------------------- -- Numeric WITH C AS ( SELECT seqval, ROW_NUMBER() OVER(ORDER BY seqval) AS rownum FROM dbo.NumSeq ) SELECT Cur.seqval + 1 AS start_range, Nxt.seqval - 1 AS end_range FROM C AS Cur JOIN C AS Nxt ON Nxt.rownum = Cur.rownum + 1 WHERE Nxt.seqval - Cur.seqval > 1; -- Temporal WITH C AS ( SELECT seqval, ROW_NUMBER() OVER(ORDER BY seqval) AS rownum FROM dbo.TempSeq ) SELECT DATEADD(hour, 4, Cur.seqval) AS start_range, DATEADD(hour, -4, Nxt.Seqval) AS end_range FROM C AS Cur JOIN C AS Nxt ON Nxt.rownum = Cur.rownum + 1 WHERE DATEDIFF(hour, Cur.seqval, Nxt.seqval) > 4; -- Dups WITH C AS ( SELECT seqval, ROW_NUMBER() OVER(ORDER BY seqval) AS rownum FROM (SELECT DISTINCT seqval FROM dbo.NumSeqDups) AS D ) SELECT Cur.seqval + 1 AS start_range, Nxt.seqval - 1 AS end_range FROM C AS Cur JOIN C AS Nxt ON Nxt.rownum = Cur.rownum + 1 WHERE Nxt.seqval - Cur.seqval > 1; WITH C1 AS ( SELECT seqval, ROW_NUMBER() OVER(PARTITION BY seqval ORDER BY (SELECT 0)) AS dupnum FROM dbo.NumSeqDups ), C2 AS ( SELECT seqval, ROW_NUMBER() OVER(ORDER BY seqval) AS rownum FROM C1 WHERE dupnum = 1 ) SELECT Cur.seqval + 1 AS start_range, Nxt.seqval - 1 AS end_range FROM C2 AS Cur JOIN C2 AS Nxt ON Nxt.rownum = Cur.rownum + 1 WHERE Nxt.seqval - Cur.seqval > 1; -- Big -- 24 seconds, 32,246 logical reads WITH C AS ( SELECT seqval, ROW_NUMBER() OVER(ORDER BY seqval) AS rownum FROM dbo.BigNumSeq ) SELECT Cur.seqval + 1 AS start_range, Nxt.seqval - 1 AS end_range FROM C AS Cur JOIN C AS Nxt ON Nxt.rownum = Cur.rownum + 1 WHERE Nxt.seqval - Cur.seqval > 1; ---------------------------------------------------------------------- -- Gaps, Solution 4 using cursors ---------------------------------------------------------------------- -- 250 seconds, 16,123 logical reads SET NOCOUNT ON; DECLARE @seqval AS INT, @prvseqval AS INT; DECLARE @Gaps TABLE(start_range INT, end_range INT); DECLARE C CURSOR FAST_FORWARD FOR SELECT seqval FROM dbo.BigNumSeq ORDER BY seqval; OPEN C; FETCH NEXT FROM C INTO @prvseqval; IF @@FETCH_STATUS = 0 FETCH NEXT FROM C INTO @seqval; WHILE @@FETCH_STATUS = 0 BEGIN IF @seqval - @prvseqval > 1 INSERT INTO @Gaps(start_range, end_range) VALUES(@prvseqval + 1, @seqval - 1); SET @prvseqval = @seqval; FETCH NEXT FROM C INTO @seqval; END CLOSE C; DEALLOCATE C; SELECT start_range, end_range FROM @Gaps; ---------------------------------------------------------------------- -- Islands, Solution 1 using subqueries and ranking calculations ---------------------------------------------------------------------- -- Numeric WITH StartingPoints AS ( SELECT seqval, ROW_NUMBER() OVER(ORDER BY seqval) AS rownum FROM dbo.NumSeq AS A WHERE NOT EXISTS (SELECT * FROM dbo.NumSeq AS B WHERE B.seqval = A.seqval - 1) ), EndingPoints AS ( SELECT seqval, ROW_NUMBER() OVER(ORDER BY seqval) AS rownum FROM dbo.NumSeq AS A WHERE NOT EXISTS (SELECT * FROM dbo.NumSeq AS B WHERE B.seqval = A.seqval + 1) ) SELECT S.seqval AS start_range, E.seqval AS end_range FROM StartingPoints AS S JOIN EndingPoints AS E ON E.rownum = S.rownum; -- Temporal WITH StartingPoints AS ( SELECT seqval, ROW_NUMBER() OVER(ORDER BY seqval) AS rownum FROM dbo.TempSeq AS A WHERE NOT EXISTS (SELECT * FROM dbo.TempSeq AS B WHERE B.seqval = DATEADD(hour, -4, A.seqval)) ), EndingPoints AS ( SELECT seqval, ROW_NUMBER() OVER(ORDER BY seqval) AS rownum FROM dbo.TempSeq AS A WHERE NOT EXISTS (SELECT * FROM dbo.TempSeq AS B WHERE B.seqval = DATEADD(hour, 4, A.seqval)) ) SELECT S.seqval AS start_range, E.seqval AS end_range FROM StartingPoints AS S JOIN EndingPoints AS E ON E.rownum = S.rownum; -- Dups WITH StartingPoints AS ( SELECT seqval, ROW_NUMBER() OVER(ORDER BY seqval) AS rownum FROM (SELECT DISTINCT seqval FROM dbo.NumSeqDups) AS A WHERE NOT EXISTS (SELECT * FROM dbo.NumSeqDups AS B WHERE B.seqval = A.seqval - 1) ), EndingPoints AS ( SELECT seqval, ROW_NUMBER() OVER(ORDER BY seqval) AS rownum FROM (SELECT DISTINCT seqval FROM dbo.NumSeqDups) AS A WHERE NOT EXISTS (SELECT * FROM dbo.NumSeqDups AS B WHERE B.seqval = A.seqval + 1) ) SELECT S.seqval AS start_range, E.seqval AS end_range FROM StartingPoints AS S JOIN EndingPoints AS E ON E.rownum = S.rownum; -- Big -- 17 seconds, 64,492 logical reads WITH StartingPoints AS ( SELECT seqval, ROW_NUMBER() OVER(ORDER BY seqval) AS rownum FROM dbo.BigNumSeq AS A WHERE NOT EXISTS (SELECT * FROM dbo.BigNumSeq AS B WHERE B.seqval = A.seqval - 1) ), EndingPoints AS ( SELECT seqval, ROW_NUMBER() OVER(ORDER BY seqval) AS rownum FROM dbo.BigNumSeq AS A WHERE NOT EXISTS (SELECT * FROM dbo.BigNumSeq AS B WHERE B.seqval = A.seqval + 1) ) SELECT S.seqval AS start_range, E.seqval AS end_range FROM StartingPoints AS S JOIN EndingPoints AS E ON E.rownum = S.rownum; ---------------------------------------------------------------------- -- Islands, Solution 2 using group identifier based on subqueries ---------------------------------------------------------------------- -- Numeric SELECT MIN(seqval) AS start_range, MAX(seqval) AS end_range FROM (SELECT seqval, (SELECT MIN(B.seqval) FROM dbo.NumSeq AS B WHERE B.seqval >= A.seqval AND NOT EXISTS (SELECT * FROM dbo.NumSeq AS C WHERE C.seqval = B.seqval + 1)) AS grp FROM dbo.NumSeq AS A) AS D GROUP BY grp; -- Temporal SELECT MIN(seqval) AS start_range, MAX(seqval) AS end_range FROM (SELECT seqval, (SELECT MIN(B.seqval) FROM dbo.TempSeq AS B WHERE B.seqval >= A.seqval AND NOT EXISTS (SELECT * FROM dbo.TempSeq AS C WHERE C.seqval = DATEADD(hour, 4, B.seqval))) AS grp FROM dbo.TempSeq AS A) AS D GROUP BY grp; -- Dups SELECT MIN(seqval) AS start_range, MAX(seqval) AS end_range FROM (SELECT seqval, (SELECT MIN(B.seqval) FROM dbo.NumSeqDups AS B WHERE B.seqval >= A.seqval AND NOT EXISTS (SELECT * FROM dbo.NumSeqDups AS C WHERE C.seqval = B.seqval + 1)) AS grp FROM dbo.NumSeqDups AS A) AS D GROUP BY grp; -- Big -- aborted execution after 600 seconds SELECT MIN(seqval) AS start_range, MAX(seqval) AS end_range FROM (SELECT seqval, (SELECT MIN(B.seqval) FROM dbo.BigNumSeq AS B WHERE B.seqval >= A.seqval AND NOT EXISTS (SELECT * FROM dbo.BigNumSeq AS C WHERE C.seqval = B.seqval + 1)) AS grp FROM dbo.BigNumSeq AS A) AS D GROUP BY grp; ---------------------------------------------------------------------- -- Islands, Solution 3 using group identifier based on ranking calculations ---------------------------------------------------------------------- -- Numeric SELECT MIN(seqval) AS start_range, MAX(seqval) AS end_range FROM (SELECT seqval, seqval - ROW_NUMBER() OVER(ORDER BY seqval) AS grp FROM dbo.Numseq) AS D GROUP BY grp; -- Temporal SELECT MIN(seqval) AS start_range, MAX(seqval) AS end_range FROM (SELECT seqval, DATEADD(hour, -4 * ROW_NUMBER() OVER(ORDER BY seqval), seqval) AS grp FROM dbo.Tempseq) AS D GROUP BY grp; -- Dups SELECT MIN(seqval) AS start_range, MAX(seqval) AS end_range FROM (SELECT seqval, seqval - DENSE_RANK() OVER(ORDER BY seqval) AS grp FROM dbo.Numseq) AS D GROUP BY grp; -- Big -- 10 seconds, 16,123 logical reads SELECT MIN(seqval) AS start_range, MAX(seqval) AS end_range FROM (SELECT seqval, seqval - ROW_NUMBER() OVER(ORDER BY seqval) AS grp FROM dbo.BigNumseq) AS D GROUP BY grp; ---------------------------------------------------------------------- -- Islands, Solution 4 using cursors ---------------------------------------------------------------------- -- 217 seconds, 16,123 logical reads SET NOCOUNT ON; DECLARE @seqval AS INT, @prvseqval AS INT, @first AS INT; DECLARE @Islands TABLE(start_range INT, end_range INT); DECLARE C CURSOR FAST_FORWARD FOR SELECT seqval FROM dbo.BigNumSeq ORDER BY seqval; OPEN C; FETCH NEXT FROM C INTO @seqval; SET @first = @seqval; SET @prvseqval = @seqval; WHILE @@FETCH_STATUS = 0 BEGIN IF @seqval - @prvseqval > 1 BEGIN INSERT INTO @Islands(start_range, end_range) VALUES(@first, @prvseqval); SET @first = @seqval; END SET @prvseqval = @seqval; FETCH NEXT FROM C INTO @seqval; END IF @first IS NOT NULL INSERT INTO @Islands(start_range, end_range) VALUES(@first, @prvseqval); CLOSE C; DEALLOCATE C; SELECT start_range, end_range FROM @Islands; ---------------------------------------------------------------------- -- Variation of Islands Problem ---------------------------------------------------------------------- SET NOCOUNT ON; USE tempdb; IF OBJECT_ID('dbo.T1') IS NOT NULL DROP TABLE dbo.T1; CREATE TABLE dbo.T1 ( id INT NOT NULL PRIMARY KEY, val VARCHAR(10) NOT NULL ); GO INSERT INTO dbo.T1(id, val) VALUES(2, 'a'); INSERT INTO dbo.T1(id, val) VALUES(3, 'a'); INSERT INTO dbo.T1(id, val) VALUES(5, 'a'); INSERT INTO dbo.T1(id, val) VALUES(7, 'b'); INSERT INTO dbo.T1(id, val) VALUES(11, 'b'); INSERT INTO dbo.T1(id, val) VALUES(13, 'a'); INSERT INTO dbo.T1(id, val) VALUES(17, 'a'); INSERT INTO dbo.T1(id, val) VALUES(19, 'a'); INSERT INTO dbo.T1(id, val) VALUES(23, 'c'); INSERT INTO dbo.T1(id, val) VALUES(29, 'c'); INSERT INTO dbo.T1(id, val) VALUES(31, 'a'); INSERT INTO dbo.T1(id, val) VALUES(37, 'a'); INSERT INTO dbo.T1(id, val) VALUES(41, 'a'); INSERT INTO dbo.T1(id, val) VALUES(43, 'a'); INSERT INTO dbo.T1(id, val) VALUES(47, 'c'); INSERT INTO dbo.T1(id, val) VALUES(53, 'c'); INSERT INTO dbo.T1(id, val) VALUES(59, 'c'); WITH C AS ( SELECT id, val, ROW_NUMBER() OVER(ORDER BY id) - ROW_NUMBER() OVER(ORDER BY val, id) AS grp FROM dbo.T1 ) SELECT MIN(id) AS mn, MAX(id) AS mx, val FROM C GROUP BY val, grp ORDER BY mn;