Skip to content

Instantly share code, notes, and snippets.

@LuckyDima
Last active May 16, 2019 10:57
Show Gist options
  • Select an option

  • Save LuckyDima/04ca220f0cf6cb773641a001fb6bf6f4 to your computer and use it in GitHub Desktop.

Select an option

Save LuckyDima/04ca220f0cf6cb773641a001fb6bf6f4 to your computer and use it in GitHub Desktop.
Finding the weight of words for different rows
DECLARE @tbl TABLE (FullName NVARCHAR(MAX))
INSERT @tbl(FullName)
VALUES
(N'All-America Research Team: Basic Materials - Best UAC (Chemicals)'),
('All-America Research Team: Basic Materials - Chemicals'),
('All-America Fixed-Income Research Team: Client Services - Best Credit-Rating Agencies'),
('All-America Fixed-Income Research Team'),
('All-America Research Team'),
('All-America Trading Team'),
('All-Asia Research Team'),
('All-Asia Reserach Team'),
('All-Brazil Research Team'),
('All-China Research Team'),
('All-Europe Equity Research Team'),
('All-Europe Equity Trading Tearm Buy-Side'),
('All-Europe Fixed-Income Research Team'),
('All-Europe Fixed-Income Reserach Team'),
('All-Europe Research Team'),
('All-Europe Trading Team'),
('All-India Research Team'),
('All-India Reserach Team'),
('All-India Web Research Team'),
('All-Japan Research Team'),
('All-Japan Reserach Team'),
('All-Russia Research Team'),
('Emerging EMEA Research Team'),
('Emerging-Markets Equity Research Team'),
('Latin America Research Team'),
('Latin America Reserach Team')
;WITH cte AS
(
SELECT
FullName,
value
FROM @tbl
CROSS APPLY STRING_SPLIT((TRANSLATE(FullName, '+-*/%():',' ')),' ')
),
cte2 AS
(
SELECT
cte.FullName,
cte.value,
SOUNDEX(cte.value) AS 'Sound',
DENSE_RANK() OVER (ORDER BY cte.FullName) AS 'GroupRank',
ROW_NUMBER() OVER (PARTITION BY cte.FullName ORDER BY (SELECT NULL)) AS 'Weight'
FROM cte WHERE cte.value != ''
),
cte3 AS
(
SELECT c.*,
cc.FullName AS _FullName,
cc.value AS _value,
cc.Sound AS _Sound,
cc.GroupRank AS _GroupRank,
cc.Weight AS _Weight,
LAG(cc.Sound, 1,0) OVER (ORDER BY cc.GroupRank,cc.Weight,c.GroupRank,c.Weight) lag,
LEAD(cc.Sound, 1,0) OVER (ORDER BY cc.GroupRank,cc.Weight,c.GroupRank,c.Weight) lead
FROM cte2 c
LEFT JOIN cte2 cc
ON c.GroupRank != cc.GroupRank OR (cc.GroupRank != c.GroupRank AND cc.Weight != c.Weight)
),cte4 AS
(
SELECT *,
CASE
WHEN Sound = _Sound THEN 0
WHEN Sound = lag OR Sound = lead THEN ABS(_Weight - Weight)
END AS nm
FROM cte3
WHERE cte3.Weight =cte3._Weight
OR CASE
WHEN Sound = _Sound THEN 0
WHEN Sound = lag OR Sound = lead THEN _Weight
END IS NOT NULL
),
cte5 AS
(
SELECT *, ROW_NUMBER() OVER (PARTITION BY GroupRank,Weight ORDER BY GroupRank,Weight) rnk2 FROM
cte4
)
SELECT DISTINCT value,
COUNT(value) OVER (PARTITION BY SOUNDEX(value)) AS w
FROM cte5
WHERE rnk2 = 1 AND nm IS NOT NULL
ORDER BY w DESC, value ASC
--SELECT SOUNDEX('Research'), SOUNDEX('Reserach')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment