Skip to content

Instantly share code, notes, and snippets.

@travisjungroth
Last active February 5, 2025 19:04
Show Gist options
  • Save travisjungroth/0c2c7c6ec46f7747a470ee42ba977c48 to your computer and use it in GitHub Desktop.
Save travisjungroth/0c2c7c6ec46f7747a470ee42ba977c48 to your computer and use it in GitHub Desktop.
Which last name has appeared in MLB every year for the longest unbroken stretch?
from collections import defaultdict
import pandas as pd
# Data is from seanlahman.com
# Both tables are read with identical options: latin-1 decoding, and
# low_memory=False so each file is parsed in one pass rather than in chunks.
people, appearances = (
    pd.read_csv(csv_name, encoding="latin-1", low_memory=False)
    for csv_name in ("People.csv", "Appearances.csv")
)
# Lookup tables derived from the People table:
#   player_id_to_last_name: playerID -> nameLast
#   last_name_to_ids:       nameLast -> every playerID carrying that last name
player_id_to_last_name: dict[str, str] = {}
last_name_to_ids: defaultdict[str, set[str]] = defaultdict(set)
# Zip the two needed columns rather than calling DataFrame.iterrows(), which
# builds a full Series for every row just to read two fields.
for pid, last_name in zip(people["playerID"], people["nameLast"]):
    player_id_to_last_name[pid] = last_name
    last_name_to_ids[last_name].add(pid)
# Lookup tables derived from the Appearances table:
#   player_id_to_years: playerID -> set of yearID values the player appears in
#   last_name_to_years: nameLast -> set of yearID values in which any player
#                       with that last name appears
player_id_to_years: defaultdict[str, set[int]] = defaultdict(set)
last_name_to_years: defaultdict[str, set[int]] = defaultdict(set)
# Zip the two needed columns rather than calling DataFrame.iterrows(), which
# builds a full Series for every row just to read two fields.
for pid, raw_year in zip(appearances["playerID"], appearances["yearID"]):
    year = int(raw_year)
    player_id_to_years[pid].add(year)
    last_name = player_id_to_last_name.get(pid)
    if last_name is None:
        # The player has no row in the People table. Do not file the year
        # under a ``None`` "last name", which would pool unrelated players.
        continue
    last_name_to_years[last_name].add(year)
def _streak_starts(
    candidates: set[str],
    years_by_name: dict[str, set[int]],
    first_year: int = 1871,
    end_year: int = 2024,
) -> dict[str, int]:
    """Return the first year of each last name's unbroken run through the final season.

    The final season is ``end_year - 1`` (``end_year`` is exclusive, like the
    upper bound of ``range``). A name is included only if it appears in the
    final season; its value is the earliest year ``y >= first_year`` such that
    the name appears in every year from ``y`` through the final season.
    Names absent from the final season are left out of the result.
    """
    final_year = end_year - 1
    starts: dict[str, int] = {}
    if final_year < first_year:
        return starts
    for candidate in candidates:
        seasons = years_by_name.get(candidate, ())
        if final_year not in seasons:
            continue
        # Walk backwards from the final season until the first gap (or until
        # the earliest year considered).
        streak_start = final_year
        while streak_start > first_year and (streak_start - 1) in seasons:
            streak_start -= 1
        starts[candidate] = streak_start
    return starts


def _print_top(
    title: str, starts: dict[str, int], limit: int = 10
) -> list[tuple[str, int]]:
    """Print ``title`` and the ``limit`` names with the earliest streak starts.

    Ties on the start year are broken by name so the listing does not depend
    on set/dict iteration order. Returns the ranked ``(name, year)`` pairs.
    """
    print(title)
    ranked = sorted(starts.items(), key=lambda item: (item[1], str(item[0])))[:limit]
    for rank, (name, year) in enumerate(ranked, start=1):
        print(f"{rank}. {name}: {year}")
    return ranked


possible_last_names = set(last_name_to_ids)
last_name_to_start: dict[str, int] = _streak_starts(
    possible_last_names, last_name_to_years
)
top = _print_top("2023", last_name_to_start)
"""
2023
1. Johnson: 1907
2. Miller: 1939
3. Jones: 1945
4. Jackson: 1950
5. Smith: 1950
6. Davis: 1951
7. Reed: 1958
8. Martinez: 1962
9. Perez: 1964
10. Hernandez: 1965
"""
# Redo it and remove Reed, since that was a top 10 but there was no Reed in 2024.
# The year window is unchanged (it still ends with 2023), so this listing is
# the same computation with "Reed" excluded by hand.
possible_last_names = set(last_name_to_ids) - {"Reed"}
last_name_to_start = _streak_starts(possible_last_names, last_name_to_years)
top = _print_top("\n2024", last_name_to_start)
"""
2024
1. Johnson: 1907
2. Miller: 1939
3. Jones: 1945
4. Jackson: 1950
5. Smith: 1950
6. Davis: 1951
7. Martinez: 1962
8. Perez: 1964
9. Hernandez: 1965
10. Rodriguez: 1967
"""
# For each player count, find the longest current streak achieved by a last
# name with that many players, then keep only the counts where the streak
# starts earlier than for every smaller count.

# Last name -> the players with that name who appeared at least once during
# the name's streak (streak start through 2023).
in_streak_last_name_to_ids: defaultdict[str, set[str]] = defaultdict(set)
for name, start_year in last_name_to_start.items():
    streak = set(range(start_year, 2024))
    in_streak_last_name_to_ids[name].update(
        pid
        for pid in last_name_to_ids[name]
        if not player_id_to_years[pid].isdisjoint(streak)
    )

# (player count, streak start, name), ordered by fewest players first and,
# within a count, by earliest start.
long_streaks_few_players = sorted(
    (len(in_streak_last_name_to_ids[name]), start_year, name)
    for name, start_year in last_name_to_start.items()
)

# Player count -> earliest streak start for that count. The list is sorted,
# so the first entry seen for a count carries its earliest start year.
count_to_year: dict[int, int] = {}
for player_count, start_year, _name in long_streaks_few_players:
    count_to_year.setdefault(player_count, start_year)

# Going up in player count, keep a count only if its streak starts strictly
# earlier than the last one kept.
keepers: list[tuple[int, int]] = []
for count, year in sorted(count_to_year.items()):
    if not keepers or year < keepers[-1][1]:
        keepers.append((count, year))
count_to_year_keepers = dict(keepers)

# Collect every name that matches a kept (count, start year) pair.
count_year_name = defaultdict(list)
for player_count, start_year, name in long_streaks_few_players:
    if count_to_year_keepers.get(player_count) == start_year:
        count_year_name[(player_count, start_year)].append(name)

for (count, start_year), matching_names in count_year_name.items():
    joined_names = ', '.join(sorted(matching_names))
    print(f"{start_year}: {count} {joined_names}")
"""
2004: 1 Greinke
1998: 3 Polanco
1997: 4 Gomes
1996: 6 Chavez
1991: 7 Hunter
1985: 9 Myers
1978: 13 Stewart
1973: 17 Hill
1967: 20 Harris
1950: 33 Jackson
1939: 51 Miller
1907: 108 Johnson
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment