Created November 6, 2023 21:11
How to write an algorithm to group people into optimized groups based on timezone and track
-- first query all the users
WITH offsets AS (SELECT a.*,
                        EXTRACT(hour FROM ptn.utc_offset) AS utc_offset
                 FROM bootcamp.attendees a
                          JOIN pg_timezone_names ptn ON a.timezone = ptn.name
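                          -- pg_timezone_names is a built-in Postgres catalog view; the join resolves each
                          -- attendee's IANA timezone name to its UTC offset (EXTRACT(hour ...) keeps the whole-hour part)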
                 WHERE a.bootcamp_version = 3
                   AND a.timezone IS NOT NULL
                   AND a.content_delivery = 'Live'::text
),
-- then aggregate the users by track and offset; we want matching timezones to fill up first
-- Label incomplete groups as "needs merging"
offsets_aggregated AS (SELECT offsets.utc_offset,
                              offsets.track,
                              array_agg(offsets.user_id) AS users,
                              count(1) AS cnt,
                              CASE
                                  WHEN count(1) < 5 THEN 'needs merging'::text
                                  ELSE 'good'::text
                              END AS status
                       FROM offsets
                       GROUP BY offsets.utc_offset, offsets.track),
-- For the groups that need merging, round their offset to the nearest 5 to widen their limits
windowed_v2 AS (SELECT CASE
                           WHEN offsets_aggregated.status = 'needs merging'::text
                               THEN round(offsets_aggregated.utc_offset / 5::numeric) * 5::numeric
                           ELSE offsets_aggregated.utc_offset
                       END AS utc_offset,
                       offsets_aggregated.track,
                       array_agg(offsets_aggregated.users::text) AS users,
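                       -- users is now an array of array-literals-as-text, so the head count below is taken by
                       -- flattening everything into one comma-separated string and counting the pieces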
                       CASE
                           WHEN cardinality(string_to_array(array_agg(offsets_aggregated.users::text)::text,
                                                            ','::text)) < 5 THEN 'needs_merging'::text
                           ELSE 'good'::text
                       END AS status
                FROM offsets_aggregated
                GROUP BY (
                    CASE
                        WHEN offsets_aggregated.status = 'needs merging'::text
                            THEN round(offsets_aggregated.utc_offset / 5::numeric) * 5::numeric
                        ELSE offsets_aggregated.utc_offset
                    END), offsets_aggregated.track),
-- even after widening timezones, you'll have a few "stragglers" in remote places that are far away from everybody else
-- For these users, split the world at London and assign them into a "good" group that is either east or west of London
-- Depending on their utc offset
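-- (MIN picks the westernmost "good" offset for stragglers west of UTC, MAX picks the easternmost
-- "good" offset for stragglers east of UTC, both within the same track)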
windowed_v3 AS (SELECT CASE
                           WHEN windowed_v2.status = 'needs_merging'::text THEN
                               CASE
                                   WHEN windowed_v2.utc_offset < 0::numeric THEN min(
                                           CASE
                                               WHEN windowed_v2.status = 'good'::text
                                                   THEN windowed_v2.utc_offset
                                               ELSE NULL::numeric
                                           END)
                                           OVER (PARTITION BY windowed_v2.track ORDER BY windowed_v2.utc_offset, (windowed_v2.status = 'good'::text) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
                                   ELSE max(
                                           CASE
                                               WHEN windowed_v2.status = 'good'::text THEN windowed_v2.utc_offset
                                               ELSE NULL::numeric
                                           END)
                                           OVER (PARTITION BY windowed_v2.track ORDER BY windowed_v2.utc_offset, (windowed_v2.status = 'good'::text) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
                               END
                           ELSE windowed_v2.utc_offset
                       END AS utc_offset,
                       windowed_v2.utc_offset AS prev_offset,
                       windowed_v2.status,
                       windowed_v2.track,
                       windowed_v2.users
                FROM windowed_v2),
-- This query gets us back to utc offsets and the array of users in each bucket
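-- (the regexp_replace strips everything except digits and commas, collapsing the nested array text
-- back into one flat comma-separated list of user ids)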
final_merge AS (SELECT windowed_v3.utc_offset,
                       windowed_v3.track,
                       cardinality(string_to_array(
                               regexp_replace(array_agg(windowed_v3.users::text)::text, '[^0-9,]'::text, ''::text,
                                              'g'::text), ','::text)) AS num_users,
                       string_to_array(
                               regexp_replace(array_agg(windowed_v3.users::text)::text, '[^0-9,]'::text, ''::text,
                                              'g'::text), ','::text) AS users
                FROM windowed_v3
                GROUP BY windowed_v3.utc_offset, windowed_v3.track),
-- let's unnest now so we can see how big each group is
unnested AS (SELECT unnest(final_merge.users) AS u,
                    cardinality(final_merge.users) AS group_size,
                    final_merge.track,
                    final_merge.utc_offset
             FROM final_merge),
-- let's assign group numbers by dividing by 5
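-- (row_number() integer-divided by 5 chunks each track/offset bucket into groups of at most five people)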
numbered AS (SELECT unnested.u,
                    unnested.group_size,
                    unnested.track,
                    unnested.utc_offset,
                    row_number() OVER (PARTITION BY unnested.track, unnested.utc_offset) / 5 AS group_num
             FROM unnested),
-- now we have the groups ready to go
matched_groups AS (SELECT numbered.track,
                          numbered.group_num + 1 AS group_number,
                          numbered.utc_offset,
                          array_agg(numbered.u) AS users
                   FROM numbered
                   GROUP BY numbered.track, (numbered.group_num + 1), numbered.utc_offset),
-- But the issue is there are some utc offsets with like... 7 people
-- We should combine the small straggler groups of 2 or 3
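-- (the self-join below folds any group of fewer than 4 people into group 1 of the same track/offset,
-- as long as the combined group stays under 7 people)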
combo_groups AS (SELECT mg.track,
                        mg.group_number AS new_group_number,
                        mg2.group_number AS old_group_number,
                        mg.utc_offset,
                        mg2.users || mg.users AS users
                 FROM matched_groups mg
                          JOIN matched_groups mg2 ON mg.track = mg2.track AND mg.utc_offset = mg2.utc_offset AND
                                                     mg.group_number <> mg2.group_number AND
                                                     mg.group_number = 1 AND cardinality(mg2.users) < 4
                 WHERE cardinality(mg2.users || mg.users) < 7),
-- bring back the nice name of the utc offset
offset_names AS (SELECT offsets.utc_offset,
                        min(offsets.timezone) AS timezone
                 FROM offsets
                 GROUP BY offsets.utc_offset),
-- build the group names
-- do a FULL OUTER JOIN with combo_groups
-- If there's a hit with combo groups, pick that group, otherwise filter it out
final_query AS (SELECT mg.track,
                       mg.utc_offset,
                       o.timezone AS group_timezone,
                       ((((('V3'::text || ' '::text) || mg.track) || ' '::text) || o.timezone) || ' '::text) ||
                       mg.group_number AS group_name,
                       COALESCE(cg.new_group_number, mg.group_number) AS group_number,
                       COALESCE(cg.users, mg.users) AS users,
                       cardinality(COALESCE(cg.users, mg.users)) AS num_users
                FROM matched_groups mg
                         FULL JOIN combo_groups cg ON mg.track = cg.track AND
                                                      (mg.group_number = cg.new_group_number OR
                                                       mg.group_number = cg.old_group_number) AND
                                                      mg.utc_offset = cg.utc_offset
                         LEFT JOIN offset_names o ON mg.utc_offset = o.utc_offset
                WHERE cg.new_group_number = mg.group_number
                   OR cg.new_group_number IS NULL
                ORDER BY mg.track, o.timezone, (COALESCE(cg.new_group_number, mg.group_number)))
SELECT final_query.track,
       final_query.utc_offset,
       final_query.group_timezone,
       final_query.group_name,
       final_query.group_number,
       final_query.users,
       final_query.num_users
FROM final_query;
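-- To sanity-check the result, one option is to swap the final SELECT above for a quick aggregate
-- over the same CTE chain (nothing new is assumed here, it only uses the final_query CTE), e.g.:
-- SELECT final_query.track,
--        final_query.num_users,
--        count(*) AS group_count
-- FROM final_query
-- GROUP BY final_query.track, final_query.num_users
-- ORDER BY final_query.track, final_query.num_users;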
@SpikeyCoder I tried 3-hour blocks too but didn't get the distribution I was looking for. That's a very easy thing to edit and try differently though!
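For anyone who wants to experiment with that, the only piece to change is the rounding expression in windowed_v2 (it shows up in both the SELECT and the GROUP BY); a rough sketch of a 3-hour-block version of that expression:
CASE
    WHEN offsets_aggregated.status = 'needs merging'::text
        THEN round(offsets_aggregated.utc_offset / 3::numeric) * 3::numeric
    ELSE offsets_aggregated.utc_offset
END AS utc_offset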
Really cool stuff. Still working through the different parts, but how did you decide on 5 as the count to increment to and merge? Was it based on the number of time zones in the group?