Created
April 19, 2013 06:48
-
-
Save natarajanc/5418566 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# <nbformat>3.0</nbformat> | |
# <headingcell level=2> | |
# Soccer 101 - Player Positions | |
# <codecell> | |
from IPython.core.display import Image | |
Image('http://www.soccer-training-guide.com/images/wikipedia_positions.png') | |
# <headingcell level=3> | |
# Parse Players data | |
# <codecell> | |
def get_players_data(players, rows):
    '''Parse transfer-table rows and append one dict per row to ``players``.

    Each row's ``td`` cells are read by position: 3 = name (plus the profile
    link), 4 = club, 5 = age, 7 = position, 9 = from, 11 = to, 12 = fee.
    A dict is appended for every row, even when some cells are missing.

    players -- list that receives the parsed dicts (mutated in place)
    rows    -- iterable of row elements exposing ``find_all('td')``

    Returns None.
    '''
    # Cells whose text is stored unchanged: column index -> dict key.
    text_columns = {
        3: 'name',
        4: 'club',
        5: 'age',
        7: 'position',
        9: 'from',
        11: 'to',
    }
    for row in rows:
        players_data = {}
        for i, item in enumerate(row.find_all('td')):
            if i in text_columns:
                players_data[text_columns[i]] = item.get_text()
            if i == 3:
                # 'handle' and 'id' are cut out of the profile link's href
                # (path segment 2, and the number between '_' and '.' in
                # segment 4). A name cell without a link yields neither key.
                link = item.find('a')
                if link is not None:
                    parts = link.get('href').split('/')
                    players_data['handle'] = parts[2]
                    players_data['id'] = parts[4].split('_')[1].split('.')[0]
            elif i == 12:
                # The fee cell is split on spaces: first token is the pound
                # amount, second token minus its first character is the euro
                # amount. A one-token cell has no euro part, so the euro key
                # is left out instead of raising IndexError.
                fee = item.get_text().split(' ')
                players_data['transfer_fee_pounds'] = fee[0]
                if len(fee) > 1:
                    players_data['transfer_fee_euros'] = fee[1][1:]
        players.append(players_data)
# <headingcell level=3> | |
# Fetch Players data (transfers in a given year) | |
# <codecell> | |
def fetch_data(players, year):
    '''Download the Premier League transfer page for ``year`` and parse it.

    The page's ``tabelle_grafik`` table is located and its rows are handed to
    ``get_players_data``, which appends one dict per row to ``players``.

    players -- list that receives the parsed player dicts (mutated in place)
    year    -- season year substituted into the page URL

    Raises a ``requests`` exception on HTTP errors or timeouts, and
    ValueError when the page has no transfer table.
    '''
    url = 'http://www.transfermarkt.co.uk/en/premier-league/transferrekorde/wettbewerb_GB1_%s_default_default_default_alle.html' % (year)
    # A timeout keeps a stalled server from hanging the run; checking the
    # status stops an error page from being parsed as if it were data.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text)
    table = soup.find('table', {'class': "tabelle_grafik"})
    if table is None:
        raise ValueError('no "tabelle_grafik" table found at %s' % url)
    # All "hell" rows are parsed before any "dunkel" row, so ``players`` is
    # grouped by row class rather than kept in page order.
    get_players_data(players, table.find_all('tr', {'class': "hell"}))
    # The first "dunkel" row is skipped, presumably because it is not a
    # player row -- TODO confirm against the live page.
    get_players_data(players, table.find_all('tr', {'class': "dunkel"})[1:])
# <headingcell level=2> | |
# Get player performance data for a particular season in EPL | |
# <codecell> | |
def extend_data(players, new_players, year):
    '''Add season performance figures to every player dict.

    For each player the performance page for ``year`` is downloaded and the
    first "hell" row of its ``standard_tabelle`` table is read by cell
    position: 2 = matches, 3 = goals, 5 = assists, 11 = minutes_per_goal,
    12 = minutes. A cell showing '-' is stored as the integer 0; any other
    cell is stored as its text.

    players     -- list of dicts carrying at least 'handle' and 'id'
    new_players -- list that receives each player dict after it is updated
    year        -- season year substituted into the page URL

    The dicts in ``players`` are modified in place and the same objects are
    appended to ``new_players``. A player whose page has no stats table or
    row is appended without the stat keys instead of aborting the whole run.

    Returns None.
    '''
    # Stats row: column index -> key written onto the player dict.
    stat_columns = {
        2: 'matches',
        3: 'goals',
        5: 'assists',
        11: 'minutes_per_goal',
        12: 'minutes',
    }
    for player in players:
        url = 'http://www.transfermarkt.co.uk/en/%s/leistungsdaten/spieler_%s_%s.html' % (player['handle'], player['id'], year)
        r = requests.get(url, timeout=30)  # do not hang on a stalled server
        soup = BeautifulSoup(r.text)
        table = soup.find('table', {'class': "standard_tabelle"})
        # Assumes the wanted figures sit in the first "hell" row.
        row = table.find('tr', {'class': "hell"}) if table is not None else None
        cells = row.find_all('td') if row is not None else []
        for i, item in enumerate(cells):
            key = stat_columns.get(i)
            if key is None:
                continue
            text = item.get_text()
            player[key] = text if text != '-' else 0
        new_players.append(player)
# <headingcell level=2> | |
# Test code to dump player data by season onto files | |
# <codecell> | |
import pickle,requests | |
from bs4 import BeautifulSoup | |
import requests , re | |
# Scrape one season end to end and show the first enriched record.
year = '2010'
players = []
new_players = []
fetch_data(players, year)
extend_data(players, new_players, year)
print(new_players[0])
# Uncomment to save the season to disk:
#pickle.dump( new_players, open( "players_data_2010.pkl", "wb" ) )
# <headingcell level=2> | |
# Load players dataframe | |
# <codecell> | |
import numpy as np | |
import pandas as pd | |
import pickle,matplotlib.pyplot as plt | |
from pandas import Series, DataFrame | |
# Load the pickled per-season player lists and tag each frame with its season.
# Pickles are binary, so the files are opened in 'rb' mode and closed by
# `with`. NOTE: pickle.load runs arbitrary code from the file -- only load
# pickles this project wrote itself.
with open('players_data_2010.pkl', 'rb') as f:
    p = pickle.load(f)
player_df_2010 = DataFrame(p)
player_df_2010['season'] = '10/11'

with open('players_data_2012.pkl', 'rb') as f:
    p = pickle.load(f)
player_df_2012 = DataFrame(p)
player_df_2012['season'] = '12/13'

with open('players_data.pkl', 'rb') as f:
    p = pickle.load(f)
player_df_2011 = DataFrame(p)
player_df_2011['season'] = '11/12'
print(player_df_2011.columns)

player_df = pd.concat([player_df_2010, player_df_2011, player_df_2012])

# The euro fee has its '.' characters removed before the numeric cast.
player_df['transfer_fee_euros'] = player_df.transfer_fee_euros.str.replace('.', '')
# np.float64 is spelled out: a bare `float64` exists only when numpy's names
# have been star-imported into the namespace.
# NOTE(review): if 'minutes' also uses '.' as a thousands separator, this cast
# would misread it -- confirm against the scraped data.
for col in ('transfer_fee_euros', 'minutes', 'goals', 'age', 'assists'):
    player_df[col] = player_df[col].astype(np.float64)

player_df.set_index(['id'], inplace=True, drop=False)

# Fee divided by each performance figure. A zero figure is not special-cased.
for ratio_col, stat_col in (
        ('money_by_minutes', 'minutes'),
        ('money_by_goal_minutes', 'minutes_per_goal'),
        ('money_by_assists', 'assists'),
        ('money_by_goals', 'goals'),
        ('money_by_matches', 'matches')):
    player_df[ratio_col] = player_df['transfer_fee_euros'] / player_df[stat_col].astype(np.float64)

epl_teams = [ u'Southampton', u'Aston Villa', u'Man Utd', u'Swansea', u'Liverpool', u'Blackburn', u'Everton', u'Fulham', u'Newcastle', u'West Ham', u'QPR', u'Wolves', u'Chelsea', u'Spurs', u'Man City', u'Stoke City', u'Arsenal', u'Sunderland', u'Wigan', u'Birmingham', u'West Brom']
# Keep only transfers whose destination is one of the listed clubs, grouped
# in the order the clubs are listed.
epl_to = [player_df[player_df.to == team] for team in epl_teams]
player_df = pd.concat(epl_to)
print(len(player_df))
player_df = player_df.sort_index(by='season')
print(player_df)
# get unique players who have been transferred more than once across seasons from 2010 to 2013
transfr_more_than_once = player_df.ix[player_df.id.value_counts() > 1 ].sort_index(by=['id','season'])['id'].unique()
# list of all players who have been transferred more than once from 2010 -2013
#print player_df.ix[transfr_more_than_once].sort_index(by=['id','season'])[['name','from','to','transfer_fee_euros','season']]
#[['name','position','from','to','transfer_fee_euros','season']]
# graph player transfer amount rise/fall through the 3 seasons.
def transfr_amt_by_season(p_id):
    '''Return the transfer fees of player ``p_id`` as a frame indexed by season.

    Reads the module-level ``player_df``; the result has the single column
    'transfer_fee_euros'.
    '''
    rows_for_player = player_df[player_df.id == p_id]
    fees = rows_for_player[['season', 'transfer_fee_euros']]
    return fees.set_index(keys='season')
# One column per repeat-transferred player, one row per season, holding the fee.
name_df = DataFrame(index=['10/11', '11/12', '12/13'])
for item in np.unique(transfr_more_than_once):
    #print item
    #print np.unique(player_df.ix[item]['name'])
    player_names = np.unique(player_df.ix[item]['name'])
    name_df[player_names] = transfr_amt_by_season(item)
# Seasons with no transfer for a player become None.
name_df = name_df.where(name_df.notnull(), None)
#print name_df
#print name_df.plot(figsize=(10,7))
#########
# Money spent by clubs on player acquisitions in season 11/12
season_11_12 = player_df[player_df.season == '11/12']
grouped = season_11_12.groupby(['to'])
# money paid per minute on field time
#(grouped['transfer_fee_euros'].sum()/ grouped['minutes'].sum()).plot(figsize=(10,10) ,kind='barh',rot=0)
# curious observation for Loko Moscow - seems they paid their player 9mil and he played for only 131 minutes
# print player_df.ix[grouped.groups['Loko Moscow']][['season','name','to','transfer_fee_euros','minutes']]
#print grouped.aggregate(np.sum)[['money_by_minutes']].plot()
#print grouped['money_by_minutes'].mean().plot(figsize(10,10),kind='barh' , rot=0)
#grouped.groups.plot()
####
def compare_stat(col):
    '''Plot total transfer fee divided by total ``col`` per buying club.

    For each of the seasons 10/11, 11/12 and 12/13 the module-level
    ``player_df`` is grouped by the 'to' club, and the summed
    'transfer_fee_euros' is divided by the summed ``col``. The three series
    are joined onto the 10/11 clubs, so a club with no 10/11 rows is dropped,
    and missing values become 0. One horizontal bar chart is drawn per season
    on a shared figure, followed by a line plot of the transposed table.

    col -- name of a numeric column of ``player_df`` (e.g. 'minutes')

    Returns None.
    '''
    per_season = []
    for season in ('10/11', '11/12', '12/13'):
        grouped = player_df[player_df.season == season].groupby(['to'])
        ratio = grouped['transfer_fee_euros'].sum() / grouped[col].sum()
        ratio.name = season  # becomes the column label after the join
        per_season.append(ratio)

    t1 = DataFrame(per_season[0])
    for ratio in per_season[1:]:
        t1 = t1.join(ratio)
    t1 = t1.where(pd.notnull(t1), 0)
    #print t1

    fig = plt.figure()
    fig.set_size_inches(20, 20)
    for i, season in enumerate(t1.columns, 1):
        print(season)
        # Draw on an explicit subplot instead of relying on the current
        # axes; the figure size was already set above.
        ax = fig.add_subplot(2, 2, i)
        t1[season].plot(ax=ax, kind='barh', title='season ' + season + ':fee by ' + col, rot=0)

    t1 = t1.transpose()
    # figsize must be passed as a keyword; `figsize(15,15)` called an
    # undefined function and passed its result positionally.
    t1.plot(figsize=(15, 15))
    #for seasons in
    #t1['10/11'].plot(figsize(10,10),kind='barh', rot=0)
    #print player_df.ix['22491'] # 5 million paid by man utd , on bench for 2 matches - easy money!
# Fee-per-stat charts for playing time and for goals.
for stat in ('minutes', 'goals'):
    compare_stat(stat)

# Fee per goal in season 11/12, broken down by buying club and player age.
grouped = player_df[player_df.season == '11/12'].groupby(['to', 'age'])
#print grouped.groups
#print player_df.ix['4063']
fee_per_goal = grouped['transfer_fee_euros'].sum() / grouped['goals'].sum()
fee_per_goal.plot(figsize=(10, 10), kind='barh', rot=0)
#name_df['Raul'] = name_sex_count_in_year('Raymond','M')
#player_df[player_df[player_df.id.value_counts() > 1].isin([True, False])]
#player_df['name'][player_df.groupby('id').size() > 1]
# <codecell> | |
import pickle | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from pandas import Series, DataFrame | |
#player_df.transfer_fee_euros.type | |
#player_df.position | |
print(len(player_df))
# Players ordered by fee; the result is discarded unless this is the last
# expression of a notebook cell.
player_df.sort_index(by='transfer_fee_euros')[::][["name", "club",'transfer_fee_euros','from','to']]
grouped = player_df.groupby('club').size()
#grouped.order(ascending=True).plot()
club_counts = player_df['club'].value_counts()
#print club_counts
#club_counts.plot(kind='barh', rot=0)
pos_counts = player_df['position'].value_counts() # players acquired by position - raw count
print(pos_counts)
#player_df.groupby('position')['transfer_fee_euros'].sum().plot(title='total price by position')

# Average fee per position, one column per season.
position_compare = DataFrame()
#print player_df.season.unique()
pieces = DataFrame()
for n, season in enumerate(player_df.season.unique()):
    p1 = player_df[player_df.season==season].groupby(['position'])['transfer_fee_euros'].mean()
    p1.name = season
    # Seed the table with whichever season comes first rather than the
    # literal '10/11': joining onto the empty frame yields no rows, and a
    # late '10/11' would overwrite the columns already joined.
    if n == 0:
        position_compare = DataFrame(p1)
    else:
        # Left join: positions absent from the first season are dropped.
        position_compare = position_compare.join(p1)
position_compare = position_compare.where(pd.notnull(position_compare), 0)
#print position_compare
print(position_compare.plot(kind='barh'))
#player_df[player_df.season=='10/11'].groupby(['position'])['transfer_fee_euros'].mean().plot(title='avg price by position' , kind='barh', rot=0) # shows high price paid for Strikers CF
#pos_counts.plot(kind='barh', rot=0)
#player_df['transfer_fee_euros'].hist() # distribution of money spent
# <codecell> | |
# Show the distinct destination clubs present in player_df.
rows_by_club = player_df.groupby('to').groups
rows_by_club.keys()
# <codecell> | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment