Last active
February 5, 2025 19:04
-
-
Save travisjungroth/0c2c7c6ec46f7747a470ee42ba977c48 to your computer and use it in GitHub Desktop.
What last name has appeared in the MLB each year for the longest time?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
import pandas as pd | |
# Data is from seanlahman.com
# Both tables are read with the same parser settings: latin-1 text, and
# low_memory=False so pandas parses each file in one pass.
csv_options = {"encoding": "latin-1", "low_memory": False}
people = pd.read_csv("People.csv", **csv_options)
appearances = pd.read_csv("Appearances.csv", **csv_options)
# Index the people table in both directions:
#   player ID -> last name, and last name -> every player ID that carries it.
player_id_to_last_name: dict[str, str] = {}
last_name_to_ids: defaultdict[str, set[str]] = defaultdict(set)
for player_id, surname in zip(people["playerID"], people["nameLast"]):
    player_id_to_last_name[player_id] = surname
    last_name_to_ids[surname].add(player_id)
# Record the seasons in which each player appeared, and pool those seasons
# by last name so a name's coverage can be checked year by year.
player_id_to_years: defaultdict[str, set[int]] = defaultdict(set)
last_name_to_years: defaultdict[str, set[int]] = defaultdict(set)
for pid, raw_year in zip(appearances["playerID"], appearances["yearID"]):
    year = int(raw_year)
    player_id_to_years[pid].add(year)
    last_name = player_id_to_last_name.get(pid)
    if last_name is None:
        # The player ID is not in the people table, so there is no last name
        # to credit. Skip it rather than pooling these seasons under a None key.
        continue
    last_name_to_years[last_name].add(year)
# For every last name, find where its current unbroken run begins: the
# earliest season (never before 1871) from which somebody with that name
# appeared in every season up to and including 2023.
first_season, last_season = 1871, 2023
last_name_to_start: dict[str, int] = {}
# Iterating the dict (not a set) keeps the result order reproducible between
# runs; set iteration order over strings changes with hash randomisation.
for last_name in last_name_to_ids:
    years = last_name_to_years[last_name]
    if last_season not in years:
        # Nobody with this name appeared in the final season: no active run.
        continue
    # Walk backwards from the final season until the first missing year.
    start_year = last_season
    while start_year > first_season and (start_year - 1) in years:
        start_year -= 1
    last_name_to_start[last_name] = start_year
# Print the ten last names whose active run started earliest.
# Names that share a start year are ordered alphabetically, so the listing
# (and which names make the ten-name cut) is the same on every run instead
# of depending on the order the names were inserted.
print("2023")
top = sorted(last_name_to_start.items(), key=lambda item: (item[1], item[0]))[:10]
for rank, (name, year) in enumerate(top, start=1):
    print(f"{rank}. {name}: {year}")
""" | |
2023 | |
1. Johnson: 1907 | |
2. Miller: 1939 | |
3. Jones: 1945 | |
4. Jackson: 1950 | |
5. Smith: 1950 | |
6. Davis: 1951 | |
7. Reed: 1958 | |
8. Martinez: 1962 | |
9. Perez: 1964 | |
10. Hernandez: 1965 | |
""" | |
# Redo the search with "Reed" excluded: that name made the top 10 above,
# but there was no Reed in 2024.
# As before, a name's run starts at the earliest season (never before 1871)
# from which somebody with that name appeared in every season through 2023.
first_season, last_season = 1871, 2023
last_name_to_start: dict[str, int] = {}
# Iterating the dict (not a set) keeps the result order reproducible between
# runs; set iteration order over strings changes with hash randomisation.
for last_name in last_name_to_ids:
    if last_name == "Reed":
        continue
    years = last_name_to_years[last_name]
    if last_season not in years:
        # Nobody with this name appeared in the final season: no active run.
        continue
    # Walk backwards from the final season until the first missing year.
    start_year = last_season
    while start_year > first_season and (start_year - 1) in years:
        start_year -= 1
    last_name_to_start[last_name] = start_year
# Print the ten last names whose active run started earliest.
# Names that share a start year are ordered alphabetically, so the listing
# (and which names make the ten-name cut) is the same on every run instead
# of depending on the order the names were inserted.
print("\n2024")
top = sorted(last_name_to_start.items(), key=lambda item: (item[1], item[0]))[:10]
for rank, (name, year) in enumerate(top, start=1):
    print(f"{rank}. {name}: {year}")
""" | |
2024 | |
1. Johnson: 1907 | |
2. Miller: 1939 | |
3. Jones: 1945 | |
4. Jackson: 1950 | |
5. Smith: 1950 | |
6. Davis: 1951 | |
7. Martinez: 1962 | |
8. Perez: 1964 | |
9. Hernandez: 1965 | |
10. Rodriguez: 1967 | |
""" | |
# code gets less good after here | |
# For each name with an active run, collect the players carrying that name
# who appeared in at least one season of the run (start year through 2023).
in_streak_last_name_to_ids: defaultdict[str, set[str]] = defaultdict(set)
for name, start_year in last_name_to_start.items():
    streak_years = range(start_year, 2024)
    for pid in last_name_to_ids[name]:
        # A player counts as soon as any one of their seasons is in the run.
        if any(season in streak_years for season in player_id_to_years[pid]):
            in_streak_last_name_to_ids[name].add(pid)
# One (player_count, start_year, name) entry per active run, ordered by
# fewest players first, then earliest start year, then name.
long_streaks_few_players = sorted(
    (len(in_streak_last_name_to_ids[name]), start_year, name)
    for name, start_year in last_name_to_start.items()
)

# Earliest start year for each player count. Because the list above is
# sorted, the first entry met for a count already has that count's earliest
# start year, so later entries for the same count are ignored.
count_to_year = {}
for player_count, start_year, _ in long_streaks_few_players:
    count_to_year.setdefault(player_count, start_year)

# Going from the smallest player count upwards, keep a count only when its
# run starts strictly earlier than the run of the previously kept count.
# A larger count is therefore kept only if it buys a longer run.
keepers = []
for count, year in sorted(count_to_year.items()):
    if not keepers or year < keepers[-1][1]:
        keepers.append((count, year))
count_to_year_keepers = dict(keepers)
# Gather the names whose (player count, start year) pair matches an entry in
# count_to_year_keepers, then print one line per pair with its names in
# alphabetical order.
count_year_name = defaultdict(list)
for entry in long_streaks_few_players:
    player_count, start_year, name = entry
    if count_to_year_keepers.get(player_count) != start_year:
        continue
    count_year_name[(player_count, start_year)].append(name)
for (count, start_year), names in count_year_name.items():
    joined_names = ', '.join(sorted(names))
    print(f"{start_year}: {count} {joined_names}")
""" | |
2004: 1 Greinke | |
1998: 3 Polanco | |
1997: 4 Gomes | |
1996: 6 Chavez | |
1991: 7 Hunter | |
1985: 9 Myers | |
1978: 13 Stewart | |
1973: 17 Hill | |
1967: 20 Harris | |
1950: 33 Jackson | |
1939: 51 Miller | |
1907: 108 Johnson | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment