Last active
April 17, 2024 03:23
-
-
Save SQLAdrian/a2fe74af4abcc00a4e9147dfcb68288f to your computer and use it in GitHub Desktop.
IN versus JOIN performance SQL2022
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--Tested on 24 core @ 2.5GHz, SELECT cpu_count FROM sys.dm_os_sys_info; | |
--Microsoft SQL Server 2022 Developer Edition (RTM-GDR) (KB5035432) - 16.0.1115.1 (X64) | |
--I will be using the Estimate Subtree cost to compare each select statement. | |
--Test carried out on E4 with 10,000 rows. It would be stated where more rows were used. | |
--Lookup table is set to 80% of the size of the source table. | |
--tl;dr. | |
--It really depends. | |
--Generally the IN() and JOIN perform similar. Tested to 1m rows. | |
--IN() benefits greatly from appropriate indexes on 0 to 250k rows. More so than a JOIN. Surprisingly. | |
--JOIN beneftis greatly from appropriate indexes on 250k+ rows | |
--NOT(IN()) on charater fields is slower than a JOIN irrespective of indexes and row count | |
--NOT(IN()) on charater fields degrades substantially on higher row counts, 1m+ | |
IF OBJECT_ID('tempdb..#maintable') IS NOT NULL | |
DROP TABLE #maintable; | |
CREATE TABLE #maintable (ID INT IDENTITY(1,1), N TINYINT, Other CHAR(22), Invoice VARCHAR(20)) | |
IF OBJECT_ID('tempdb..#jointable') IS NOT NULL | |
DROP TABLE #jointable; | |
CREATE TABLE #jointable (ID INT IDENTITY(1,1), N TINYINT, Other CHAR(22), Invoice VARCHAR(20)) | |
/*It's best to crack open the query plan for this and walk through each one*/ | |
/*Thanks to Jeff Moden for sponsoring the CTE. https://www.sqlservercentral.com/articles/tally-oh-an-improved-sql-8k-%e2%80%9ccsv-splitter%e2%80%9d-function*/ | |
;WITH E1(N) AS ( | |
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL | |
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL | |
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 | |
), --10E+1 or 10 rows | |
E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows | |
E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max | |
/*All yours*/ | |
Emore(N) AS (SELECT 1 FROM E4 a, E2 b) --and then some, but stop at 1 million | |
INSERT #maintable([N], [Other], Invoice) | |
SELECT CASE WHEN RIGHT(RAND(ROW_NUMBER() OVER (Order by N)),1) <5 THEN 0 ELSE 1 END | |
, ROW_NUMBER() OVER (Order by N) [Other] | |
, 'INV' + CONVERT(VARCHAR(15),ROW_NUMBER() OVER (Order by N)) Invoice | |
/*As it seems RAND uses the system timestamp it's almost sequention on the CTE, so use the tail of the RAND as input*/ | |
FROM Emore; | |
INSERT #jointable([N], [Other], Invoice) | |
SELECT TOP 80 PERCENT CASE WHEN RIGHT(RAND(ID),1) <5 THEN 0 ELSE 1 END | |
, [Other] | |
, [Invoice] | |
FROM #maintable | |
/*You can break the section below out to test as individual sections, just drop the #temp tables afterwards*/ | |
/*1. Tie. Similar performance on NOT(IN) and LEFT OUTER JOIN. | |
49%/51% split. Lower is better. | |
NOT(IN) just edging out on a cost of 0.246, compared to the JOIN at 0.2578*/ | |
/*Let's try joining on TINYINT field using NOT(IN))*/ | |
SELECT COUNT(T1.ID) FROM #maintable T1 | |
WHERE NOT(ID IN ( SELECT ID FROM #jointable)) | |
/*Let's try joining on numbers using LEFT OUTER and looking for unjoined items*/ | |
SELECT COUNT(T1.ID) FROM #maintable T1 LEFT OUTER JOIN #jointable T2 ON T1.ID = T2.ID | |
WHERE T2.ID IS NULL | |
/*2. Faster JOIN. Very different profile when excluding on a CHAR field. | |
85%/15% split. Lower is better. At 200k rows this changes to 91%/9% | |
NOT(IN) has a higher cost at 2.324, compared to the JOIN at 0.421 | |
TAKES LOOONG TO RUN on Emore CTE. 96%/4% split, so NOT(IN()) degrades with higher row count*/ | |
/*How about NOT(IN)) just joining on a CHAR field*/ | |
SELECT COUNT(T1.ID) FROM #maintable T1 | |
WHERE NOT(Other IN ( SELECT Other FROM #jointable)) | |
SELECT COUNT(T1.ID) FROM #maintable T1 LEFT OUTER JOIN #jointable T2 ON T1.Other = T2.Other | |
WHERE T2.ID IS NULL | |
/*3. Tie. Similar performance for TINYINT IN() compared to INNER JOIN. | |
50%/50% split. Lower is better. Stays consistent on higher row counts | |
0.255 cost for both*/ | |
/*Let's try joining on tinyint using (IN))*/ | |
SELECT COUNT(T1.ID ) | |
FROM #maintable T1 | |
WHERE (ID IN ( SELECT ID FROM #jointable)) | |
SELECT COUNT(T1.ID ) | |
FROM #maintable T1 | |
INNER JOIN #jointable T2 ON T1.ID = T2.ID | |
/*4. Tie. Similar performance for CHAR field using IN(). | |
50%/50% split. Lower is better. | |
0.396 cost for both*/ | |
/*Same as above, just joining on a CHAR field*/ | |
SELECT COUNT(T1.ID ) | |
FROM #maintable T1 | |
WHERE (Other IN ( SELECT Other FROM #jointable)) | |
SELECT COUNT(T1.ID ) | |
FROM #maintable T1 | |
INNER JOIN #jointable T2 ON T1.Other = T2.Other | |
/*Even when adding an index on the OTHER field, it remains a 50/50*/ | |
/*5. Tie. Similar performance for the VARCHAR field using IN(). | |
50%/50% split. Lower is better. | |
0.353 cost for both*/ | |
SELECT COUNT(T1.ID ) | |
FROM #maintable T1 | |
WHERE (Invoice IN ( SELECT Invoice FROM #jointable)) | |
SELECT COUNT(T1.ID ) | |
FROM #maintable T1 | |
INNER JOIN #jointable T2 ON T1.Invoice = T2.Invoice | |
/*6. Faster IN(). Vastly faster IN() when indexed on VARCHAR. Just to spicy things up, we'll add some indexing. | |
28%/72% split. Lower is better. At 200k rows this changes to 43%/57% | |
0.123 for the IN() and 0.317 for the JOIN | |
Fater JOIN on more rows. On 1m+ rows and going parallel this changes to benefit the JOIN at 65%/35% split.*/ | |
CREATE NONCLUSTERED INDEX IX_TEST ON #maintable(Invoice) | |
CREATE NONCLUSTERED INDEX IX_TEST ON #jointable(Invoice) | |
SELECT COUNT(T1.ID ) | |
FROM #maintable T1 | |
WHERE (Invoice IN ( SELECT Invoice FROM #jointable)) | |
SELECT COUNT(T1.ID ) | |
FROM #maintable T1 | |
INNER JOIN #jointable T2 ON T1.Invoice = T2.Invoice | |
/*7. Faster JOIN. Very different profile when excluding on a VARCHAR field. | |
85%/15% split. Lower is better. At 200k rows this changes to 92%/8% | |
NOT(IN) has a higher cost at 2.010, compared to the JOIN at 0.358*/ | |
/*How about NOT(IN)) just joining on a CHAR field*/ | |
SELECT COUNT(T1.ID) FROM #maintable T1 | |
WHERE NOT(Invoice IN ( SELECT Invoice FROM #jointable)) | |
SELECT COUNT(T1.ID) FROM #maintable T1 LEFT OUTER JOIN #jointable T2 ON T1.Invoice = T2.Invoice | |
WHERE T2.ID IS NULL | |
/*8. Faster IN(). Seeing as indexing benefitted the IN(), let's try again on the ID field | |
35%/65% split. Lower is better. At 200k rows this changes to 46%/54%. At 500k rows this changes to 73%/27% | |
IN() benefits from the index. | |
Fater JOIN on more rows. On 1m+ rows and going parallel this changes to benefit the JOIN at 64%/36% split.*/ | |
CREATE NONCLUSTERED INDEX IX_TEST2 ON #maintable(ID) | |
CREATE NONCLUSTERED INDEX IX_TEST2 ON #jointable(ID) | |
SELECT COUNT(T1.ID ) | |
FROM #maintable T1 | |
WHERE (ID IN ( SELECT ID FROM #jointable)) | |
SELECT COUNT(T1.ID ) | |
FROM #maintable T1 | |
INNER JOIN #jointable T2 ON T1.ID = T2.ID | |
/*Clean up*/ | |
IF OBJECT_ID('tempdb..#maintable') IS NOT NULL | |
DROP TABLE #maintable; | |
IF OBJECT_ID('tempdb..#jointable') IS NOT NULL | |
DROP TABLE #jointable; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment