SQLAdrian · April 17, 2024 03:23
diff --git a/IN versus JOIN performance SQL2022.sql b/IN versus JOIN performance SQL2022.sql
 --Tested on 24 core @ 2.5GHz, SELECT cpu_count FROM sys.dm_os_sys_info;
 --Microsoft SQL Server 2022 Developer Edition (RTM-GDR) (KB5035432) - 16.0.1115.1 (X64)
 --I will be using the Estimate Subtree cost to compare each select statement.
 --Test carried out on E4 with 10,000 rows. It would be stated where more rows were used.
 --Lookup table is set to 80% of the size of the source table.

 --tl;dr. 
 --It really depends.
 --Generally the IN() and JOIN perform similar. Tested to 1m rows.
 --IN() benefits greatly from appropriate indexes on 0 to 250k rows. More so than a JOIN. Surprisingly.
 --JOIN beneftis greatly from appropriate indexes on 250k+ rows
 --NOT(IN()) on charater fields is slower than a JOIN irrespective of indexes and row count
 --NOT(IN()) on charater fields degrades substantially on higher row counts, 1m+

 IF OBJECT_ID('tempdb..#maintable') IS NOT NULL
 	DROP TABLE #maintable;
 CREATE TABLE #maintable (ID INT IDENTITY(1,1), N TINYINT, Other CHAR(22), Invoice VARCHAR(20))
 IF OBJECT_ID('tempdb..#jointable') IS NOT NULL
 	DROP TABLE #jointable;
 CREATE TABLE #jointable (ID INT IDENTITY(1,1), N TINYINT, Other CHAR(22), Invoice VARCHAR(20))
 /*It's best to crack open the query plan for this and walk through each one*/
 /*Thanks to Jeff Moden for sponsoring the CTE. https://www.sqlservercentral.com/articles/tally-oh-an-improved-sql-8k-%e2%80%9ccsv-splitter%e2%80%9d-function*/
 ;WITH E1(N) AS (
                 SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                 SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                 SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
                ),                          --10E+1 or 10 rows
       E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
       E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
 	   /*All yours*/
 	   Emore(N) AS (SELECT 1 FROM E4 a, E2 b) --and then some, but stop at 1 million

 INSERT #maintable([N], [Other], Invoice)
 SELECT CASE WHEN RIGHT(RAND(ROW_NUMBER() OVER (Order by N)),1) <5 THEN 0 ELSE 1 END
 , ROW_NUMBER() OVER (Order by N) [Other]
 , 'INV' + CONVERT(VARCHAR(15),ROW_NUMBER() OVER (Order by N)) Invoice
 /*As it seems RAND uses the system timestamp it's almost sequention on the CTE, so use the tail of the RAND as input*/
 FROM Emore;

 INSERT #jointable([N], [Other], Invoice)
 SELECT TOP 80 PERCENT CASE WHEN RIGHT(RAND(ID),1) <5 THEN 0 ELSE 1 END 
 , [Other]
 , [Invoice]
 FROM #maintable
 /*You can break the section below out to test as individual sections, just drop the #temp tables afterwards*/

 /*1. Tie. Similar performance on NOT(IN) and LEFT OUTER JOIN. 
 49%/51% split. Lower is better. 
 NOT(IN) just edging out on a cost of 0.246, compared to the JOIN at 0.2578*/
 	/*Let's try joining on TINYINT field using NOT(IN))*/
 	SELECT COUNT(T1.ID) FROM #maintable T1
 	WHERE NOT(ID IN ( SELECT ID FROM #jointable))
 	/*Let's try joining on numbers using LEFT OUTER and looking for unjoined items*/
 	SELECT COUNT(T1.ID) FROM #maintable T1 LEFT OUTER JOIN #jointable T2 ON T1.ID = T2.ID
 	WHERE T2.ID IS NULL

 /*2. Faster JOIN. Very different profile when excluding on a CHAR field. 
 85%/15% split. Lower is better. At 200k rows this changes to 91%/9%
 NOT(IN) has a higher cost at 2.324, compared to the JOIN at 0.421
 TAKES LOOONG TO RUN on Emore CTE. 96%/4% split, so NOT(IN()) degrades with higher row count*/
 	/*How about NOT(IN)) just joining on a CHAR field*/
 	SELECT COUNT(T1.ID) FROM #maintable T1
 	WHERE NOT(Other IN ( SELECT Other FROM #jointable))
 	SELECT COUNT(T1.ID) FROM #maintable T1 LEFT OUTER JOIN #jointable T2 ON T1.Other = T2.Other
 	WHERE T2.ID IS NULL

 /*3. Tie. Similar performance for TINYINT IN() compared to INNER JOIN. 
 50%/50% split. Lower is better. Stays consistent on higher row counts
 0.255 cost for both*/
 	/*Let's try joining on tinyint using (IN))*/
 	SELECT COUNT(T1.ID ) 
 	FROM #maintable T1
 	WHERE (ID IN ( SELECT ID FROM #jointable))
 	SELECT COUNT(T1.ID )
 	FROM #maintable T1
 	INNER JOIN #jointable T2 ON T1.ID = T2.ID

 /*4. Tie. Similar performance for CHAR field using IN(). 
 50%/50% split. Lower is better.
 0.396 cost for both*/
 	/*Same as above, just joining on a CHAR field*/
 	SELECT COUNT(T1.ID ) 
 	FROM #maintable T1
 	WHERE (Other IN ( SELECT Other FROM #jointable))
 	SELECT COUNT(T1.ID )
 	FROM #maintable T1
 	INNER JOIN #jointable T2 ON T1.Other = T2.Other
 	/*Even when adding an index on the OTHER field, it remains a 50/50*/

 /*5. Tie. Similar performance for the VARCHAR field using IN(). 
 50%/50% split. Lower is better.
 0.353 cost for both*/
 	SELECT COUNT(T1.ID ) 
 	FROM #maintable T1
 	WHERE (Invoice IN ( SELECT Invoice FROM #jointable))
 	SELECT COUNT(T1.ID )
 	FROM #maintable T1
 	INNER JOIN #jointable T2 ON T1.Invoice = T2.Invoice

 /*6. Faster IN(). Vastly faster IN() when indexed on VARCHAR. Just to spicy things up, we'll add some indexing. 
 28%/72% split. Lower is better. At 200k rows this changes to 43%/57%
 0.123 for the IN() and 0.317 for the JOIN
 Fater JOIN on more rows. On 1m+ rows and going parallel this changes to benefit the JOIN at 65%/35% split.*/
 CREATE NONCLUSTERED INDEX IX_TEST ON #maintable(Invoice)
 CREATE NONCLUSTERED INDEX IX_TEST ON #jointable(Invoice)
 	SELECT COUNT(T1.ID ) 
 	FROM #maintable T1
 	WHERE (Invoice IN ( SELECT Invoice FROM #jointable))
 	SELECT COUNT(T1.ID )
 	FROM #maintable T1
 	INNER JOIN #jointable T2 ON T1.Invoice = T2.Invoice

 /*7. Faster JOIN. Very different profile when excluding on a VARCHAR field. 
 85%/15% split. Lower is better. At 200k rows this changes to 92%/8%
 NOT(IN) has a higher cost at 2.010, compared to the JOIN at 0.358*/
 	/*How about NOT(IN)) just joining on a CHAR field*/
 	SELECT COUNT(T1.ID) FROM #maintable T1
 	WHERE NOT(Invoice IN ( SELECT Invoice FROM #jointable))
 	SELECT COUNT(T1.ID) FROM #maintable T1 LEFT OUTER JOIN #jointable T2 ON T1.Invoice = T2.Invoice
 	WHERE T2.ID IS NULL

 /*8. Faster IN(). Seeing as indexing benefitted the IN(), let's try again on the ID field
 35%/65% split. Lower is better. At 200k rows this changes to 46%/54%. At 500k rows this changes to 73%/27%
 IN() benefits from the index.
 Fater JOIN on more rows. On 1m+ rows and going parallel this changes to benefit the JOIN at 64%/36% split.*/
 CREATE NONCLUSTERED INDEX IX_TEST2 ON #maintable(ID)
 CREATE NONCLUSTERED INDEX IX_TEST2 ON #jointable(ID)
 	SELECT COUNT(T1.ID ) 
 	FROM #maintable T1
 	WHERE (ID IN ( SELECT ID FROM #jointable))
 	SELECT COUNT(T1.ID )
 	FROM #maintable T1
 	INNER JOIN #jointable T2 ON T1.ID = T2.ID


 /*Clean up*/
 IF OBJECT_ID('tempdb..#maintable') IS NOT NULL
 	DROP TABLE #maintable;
 IF OBJECT_ID('tempdb..#jointable') IS NOT NULL
 	DROP TABLE #jointable;
	--Tested on 24 core @ 2.5GHz, SELECT cpu_count FROM sys.dm_os_sys_info;
	--Microsoft SQL Server 2022 Developer Edition (RTM-GDR) (KB5035432) - 16.0.1115.1 (X64)
	--I will be using the Estimate Subtree cost to compare each select statement.
	--Test carried out on E4 with 10,000 rows. It would be stated where more rows were used.
	--Lookup table is set to 80% of the size of the source table.

	--tl;dr.
	--It really depends.
	--Generally the IN() and JOIN perform similar. Tested to 1m rows.
	--IN() benefits greatly from appropriate indexes on 0 to 250k rows. More so than a JOIN. Surprisingly.
	--JOIN beneftis greatly from appropriate indexes on 250k+ rows
	--NOT(IN()) on charater fields is slower than a JOIN irrespective of indexes and row count
	--NOT(IN()) on charater fields degrades substantially on higher row counts, 1m+

	IF OBJECT_ID('tempdb..#maintable') IS NOT NULL
	DROP TABLE #maintable;
	CREATE TABLE #maintable (ID INT IDENTITY(1,1), N TINYINT, Other CHAR(22), Invoice VARCHAR(20))
	IF OBJECT_ID('tempdb..#jointable') IS NOT NULL
	DROP TABLE #jointable;
	CREATE TABLE #jointable (ID INT IDENTITY(1,1), N TINYINT, Other CHAR(22), Invoice VARCHAR(20))
	/It's best to crack open the query plan for this and walk through each one/
	/Thanks to Jeff Moden for sponsoring the CTE. https://www.sqlservercentral.com/articles/tally-oh-an-improved-sql-8k-%e2%80%9ccsv-splitter%e2%80%9d-function/
	;WITH E1(N) AS (
	SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
	SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
	SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
	), --10E+1 or 10 rows
	E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
	E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
	/All yours/
	Emore(N) AS (SELECT 1 FROM E4 a, E2 b) --and then some, but stop at 1 million

	INSERT #maintable([N], [Other], Invoice)
	SELECT CASE WHEN RIGHT(RAND(ROW_NUMBER() OVER (Order by N)),1) <5 THEN 0 ELSE 1 END
	, ROW_NUMBER() OVER (Order by N) [Other]
	, 'INV' + CONVERT(VARCHAR(15),ROW_NUMBER() OVER (Order by N)) Invoice
	/As it seems RAND uses the system timestamp it's almost sequention on the CTE, so use the tail of the RAND as input/
	FROM Emore;

	INSERT #jointable([N], [Other], Invoice)
	SELECT TOP 80 PERCENT CASE WHEN RIGHT(RAND(ID),1) <5 THEN 0 ELSE 1 END
	, [Other]
	, [Invoice]
	FROM #maintable
	/You can break the section below out to test as individual sections, just drop the #temp tables afterwards/

	/*1. Tie. Similar performance on NOT(IN) and LEFT OUTER JOIN.
	49%/51% split. Lower is better.
	NOT(IN) just edging out on a cost of 0.246, compared to the JOIN at 0.2578*/
	/Let's try joining on TINYINT field using NOT(IN))/
	SELECT COUNT(T1.ID) FROM #maintable T1
	WHERE NOT(ID IN ( SELECT ID FROM #jointable))
	/Let's try joining on numbers using LEFT OUTER and looking for unjoined items/
	SELECT COUNT(T1.ID) FROM #maintable T1 LEFT OUTER JOIN #jointable T2 ON T1.ID = T2.ID
	WHERE T2.ID IS NULL

	/*2. Faster JOIN. Very different profile when excluding on a CHAR field.
	85%/15% split. Lower is better. At 200k rows this changes to 91%/9%
	NOT(IN) has a higher cost at 2.324, compared to the JOIN at 0.421
	TAKES LOOONG TO RUN on Emore CTE. 96%/4% split, so NOT(IN()) degrades with higher row count*/
	/How about NOT(IN)) just joining on a CHAR field/
	SELECT COUNT(T1.ID) FROM #maintable T1
	WHERE NOT(Other IN ( SELECT Other FROM #jointable))
	SELECT COUNT(T1.ID) FROM #maintable T1 LEFT OUTER JOIN #jointable T2 ON T1.Other = T2.Other
	WHERE T2.ID IS NULL

	/*3. Tie. Similar performance for TINYINT IN() compared to INNER JOIN.
	50%/50% split. Lower is better. Stays consistent on higher row counts
	0.255 cost for both*/
	/Let's try joining on tinyint using (IN))/
	SELECT COUNT(T1.ID )
	FROM #maintable T1
	WHERE (ID IN ( SELECT ID FROM #jointable))
	SELECT COUNT(T1.ID )
	FROM #maintable T1
	INNER JOIN #jointable T2 ON T1.ID = T2.ID

	/*4. Tie. Similar performance for CHAR field using IN().
	50%/50% split. Lower is better.
	0.396 cost for both*/
	/Same as above, just joining on a CHAR field/
	SELECT COUNT(T1.ID )
	FROM #maintable T1
	WHERE (Other IN ( SELECT Other FROM #jointable))
	SELECT COUNT(T1.ID )
	FROM #maintable T1
	INNER JOIN #jointable T2 ON T1.Other = T2.Other
	/Even when adding an index on the OTHER field, it remains a 50/50/

	/*5. Tie. Similar performance for the VARCHAR field using IN().
	50%/50% split. Lower is better.
	0.353 cost for both*/
	SELECT COUNT(T1.ID )
	FROM #maintable T1
	WHERE (Invoice IN ( SELECT Invoice FROM #jointable))
	SELECT COUNT(T1.ID )
	FROM #maintable T1
	INNER JOIN #jointable T2 ON T1.Invoice = T2.Invoice

	/*6. Faster IN(). Vastly faster IN() when indexed on VARCHAR. Just to spicy things up, we'll add some indexing.
	28%/72% split. Lower is better. At 200k rows this changes to 43%/57%
	0.123 for the IN() and 0.317 for the JOIN
	Fater JOIN on more rows. On 1m+ rows and going parallel this changes to benefit the JOIN at 65%/35% split.*/
	CREATE NONCLUSTERED INDEX IX_TEST ON #maintable(Invoice)
	CREATE NONCLUSTERED INDEX IX_TEST ON #jointable(Invoice)
	SELECT COUNT(T1.ID )
	FROM #maintable T1
	WHERE (Invoice IN ( SELECT Invoice FROM #jointable))
	SELECT COUNT(T1.ID )
	FROM #maintable T1
	INNER JOIN #jointable T2 ON T1.Invoice = T2.Invoice

	/*7. Faster JOIN. Very different profile when excluding on a VARCHAR field.
	85%/15% split. Lower is better. At 200k rows this changes to 92%/8%
	NOT(IN) has a higher cost at 2.010, compared to the JOIN at 0.358*/
	/How about NOT(IN)) just joining on a CHAR field/
	SELECT COUNT(T1.ID) FROM #maintable T1
	WHERE NOT(Invoice IN ( SELECT Invoice FROM #jointable))
	SELECT COUNT(T1.ID) FROM #maintable T1 LEFT OUTER JOIN #jointable T2 ON T1.Invoice = T2.Invoice
	WHERE T2.ID IS NULL

	/*8. Faster IN(). Seeing as indexing benefitted the IN(), let's try again on the ID field
	35%/65% split. Lower is better. At 200k rows this changes to 46%/54%. At 500k rows this changes to 73%/27%
	IN() benefits from the index.
	Fater JOIN on more rows. On 1m+ rows and going parallel this changes to benefit the JOIN at 64%/36% split.*/
	CREATE NONCLUSTERED INDEX IX_TEST2 ON #maintable(ID)
	CREATE NONCLUSTERED INDEX IX_TEST2 ON #jointable(ID)
	SELECT COUNT(T1.ID )
	FROM #maintable T1
	WHERE (ID IN ( SELECT ID FROM #jointable))
	SELECT COUNT(T1.ID )
	FROM #maintable T1
	INNER JOIN #jointable T2 ON T1.ID = T2.ID


	/Clean up/
	IF OBJECT_ID('tempdb..#maintable') IS NOT NULL
	DROP TABLE #maintable;
	IF OBJECT_ID('tempdb..#jointable') IS NOT NULL
	DROP TABLE #jointable;