Last active
October 22, 2020 09:35
-
-
Save janpipek/75376fe4f202821d6023113b409d4293 to your computer and use it in GitHub Desktop.
opakovaci_ukoly
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 1. Spočítejte a nakreslete graf, jak se v průběhu let vyvíjel | |
# celkový počet udělených medailí (zvlášť pro zimní a letní hry). | |
import pandas as pd | |
import seaborn as sns | |
%matplotlib inline | |
olympics = pd.read_csv("athlete_events.csv") | |
medals = olympics.groupby(["Year", "Season"], as_index=False)["Medal"].count() | |
sns.lineplot("Year", "Medal", data=medals, hue="Season") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 2. How does the plot change if we count medal sets instead?
#    Guiding question: how many (and which) gold medals did the Czech
#    Republic win in 1998?
import pandas as pd
import seaborn as sns

olympics = pd.read_csv("athlete_events.csv")

# All medal rows recorded for NOC "CZE" in 1998.
olympics[olympics.Medal.notna() & (olympics.NOC == "CZE") & (olympics.Year == 1998)]
# Nagano - about twenty gold medals!

# One row per (Year, Season, Sport, Event); after count() every remaining
# column (including "Medal") holds a per-event count of non-null values.
events = olympics.groupby(["Year", "Season", "Sport", "Event"], as_index=False).count()

# Those per-event counts are never missing, so counting them yields the
# number of events held in each (Year, Season).
event_count = events.groupby(["Year", "Season"], as_index=False)["Medal"].count()

# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form fails there.
sns.lineplot(data=event_count, x="Year", y="Medal", hue="Season")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 3. Find the names of the ten regions (not necessarily countries, see
#    noc_regions.csv) that have won the highest absolute number of medals in
#    the history of the Olympic Games.
#    Bonus: weight the individual metals (3 = gold, 2 = silver, 1 = bronze).
import pandas as pd
import seaborn as sns

olympics = pd.read_csv("athlete_events.csv")
regions = pd.read_csv("noc_regions.csv")

# One row per distinct (event, NOC, medal) combination, so that several
# athlete rows sharing the same event, NOC and medal count only once.
# Rows with a missing value in any of these columns (e.g. no medal) are
# discarded.  Written with dropna/drop_duplicates rather than
# groupby(..., as_index=False).size(), whose return type differs between
# pandas versions.
medal_columns = ["Year", "Season", "Sport", "Event", "NOC", "Medal"]
medal_vs_country = (
    olympics
    .dropna(subset=medal_columns)[medal_columns]
    .drop_duplicates()
    .sort_values(medal_columns)
    .reset_index(drop=True)
)
medal_vs_country

# Medal counts per NOC, one column per metal, plus the weighted "Points".
noc_medals = (
    medal_vs_country
    .groupby(["NOC", "Medal"])
    .size()
    .unstack("Medal", fill_value=0)
    .astype("int")
    .assign(
        Points=lambda df: df["Bronze"] * 1 + df["Silver"] * 2 + df["Gold"] * 3
    )
)[["Gold", "Silver", "Bronze", "Points"]].reset_index()

# NOTE(review): the result has one row per NOC code and is not truncated to
# ten rows; if a region maps to several NOC codes it appears several times.
# Confirm whether aggregating by region is wanted.
(
    pd.merge(regions, noc_medals, on="NOC")
    .sort_values("Points", ascending=False)
    .drop(["notes", "NOC"], axis=1)
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 4. Find the most common first (usually given) name of an athlete for each
#    region.  (The first part of the string before a space.)
import pandas as pd

olympics = pd.read_csv("athlete_events.csv")

# `n` is passed by keyword (newer pandas versions reject it positionally),
# and `.str[0]` propagates a missing name as NaN instead of raising.
first_names = olympics["Name"].str.split(" ", n=1).str[0]

# Number of rows per (NOC, first_name); reset_index() leaves the counts in a
# column labelled 0.
name_counts = (
    olympics.assign(first_name=first_names)
    .groupby(["NOC", "first_name"])
    .size()
    .reset_index()
)

# Within every NOC put the largest count first, then keep that first row.
(
    name_counts
    .sort_values(["NOC", 0], ascending=[True, False])
    .groupby("NOC")
    .first()
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 5. Using qcut, split the participants into 4 age groups and then look at
#    the minimum, mean and maximum height and weight of the participants
#    (separately for each sex).
import pandas as pd

olympics = pd.read_csv("athlete_events.csv")

# Quartile-based age bins; retbins=True additionally returns the bin edges.
age_groups, age_bins = pd.qcut(olympics["Age"], q=4, retbins=True)

# The same three statistics are computed for both body measurements.
summary_stats = ["max", "min", "mean"]
olympics.groupby([age_groups, "Sex"]).agg(
    {"Height": summary_stats, "Weight": summary_stats}
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 6. Look (graphically) at the pairwise distributions of height, weight and
#    age, estimate what correlates with what and how, and verify it by
#    computation.
import pandas as pd
import seaborn as sns

olympics = pd.read_csv("athlete_events.csv")

# The three numeric columns under study.
body_measures = olympics[["Age", "Height", "Weight"]]

# Visual check: a grid of pairwise plots of the three columns.
sns.pairplot(body_measures)

# Numerical check: pairwise correlation matrix of the same columns.
body_measures.corr()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 7. Is Ledecká really the first female athlete to win gold in two different
#    sports at a single Winter Olympics?  What about men?  What about female
#    and male athletes at the Summer Olympics?
import pandas as pd

# Load the data here so the snippet also runs on its own instead of relying
# on an `olympics` variable left over from another cell.
olympics = pd.read_csv("athlete_events.csv")

# Number of gold-medal rows per athlete and sport within one Games.
# NOTE(review): "Age" is one of the grouping keys and groupby discards rows
# whose key is missing by default, so gold medallists without a recorded age
# are left out - confirm that this is acceptable.
gold_per_sport = (
    olympics.query("Medal == 'Gold'")
    .groupby(["Year", "Season", "Name", "Age", "Sport", "Sex"])
    .size()
)

# Collapsing the "Sport" level and counting the remaining entries gives the
# number of distinct sports with a gold medal per athlete and Games.
number_of_sports_with_medal = gold_per_sport.groupby(
    ["Year", "Season", "Name", "Age", "Sex"]
).size()

# Athletes with gold in more than one sport at the same Games.
number_of_sports_with_medal[number_of_sports_with_medal > 1]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 8. What is the probability that a participant takes home a medal,
#    depending on which Olympics (first, second, ...) it is for them?
import pandas as pd

olympics = pd.read_csv("athlete_events.csv")

# One row per (athlete ID, Season, Year); reset_index() leaves the group
# sizes in a column labelled 0.
name_vs_year = olympics.groupby(["ID", "Season", "Year"], sort=True).size().reset_index()

# Rank the years within every (ID, Season) group; the result is stored under
# the name "kolikata".
year_ranks = name_vs_year.groupby(["ID", "Season"])["Year"].rank()
name_vs_year_rank = year_ranks.rename("kolikata").astype("int")

# Attach the rank to its (ID, Season, Year) row and drop the size column.
name_vs_year_rank = name_vs_year.join(name_vs_year_rank).drop(0, axis=1)

# Bring the rank back onto every original row.
olympics_with_kolikata = pd.merge(
    olympics, name_vs_year_rank, on=["ID", "Season", "Year"]
)

# Indicator columns for each medal type, grouped by the rank.
medal_indicators = pd.get_dummies(olympics_with_kolikata["Medal"])
by_kolikata = olympics_with_kolikata.join(medal_indicators).groupby("kolikata")

# Per rank: sum and mean of each indicator, plus the number of rows.
medal_stats = by_kolikata[["Bronze", "Silver", "Gold"]].agg(["sum", "mean"])
medal_stats.assign(count=by_kolikata.size())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 9. How many athletes won a medal in a century other than the one they were
#    born in?
import pandas as pd

olympics = pd.read_csv("athlete_events.csv")

# Only medal rows are relevant.  The "+ 99) // 100" arithmetic maps years
# 1901-2000 to century 20, 2001-2100 to century 21, and so on; the birth
# year is approximated as Year - Age.
#
# Duplicates are dropped per (ID, olympics_century) rather than per ID alone:
# keeping a single row per athlete would hide athletes who won medals in two
# different centuries behind whichever of their rows happens to come first.
centuries = (
    olympics.dropna(subset=["Medal"])
    .assign(
        birth_century=lambda df: (df["Year"] - df["Age"] + 99) // 100,
        olympics_century=lambda df: (df["Year"] + 99) // 100,
    )
    .drop_duplicates(subset=["ID", "olympics_century"])
)

# Off-diagonal cells count athletes with a medal outside their birth century.
pd.crosstab(centuries["birth_century"], centuries["olympics_century"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment