Created
July 29, 2013 16:35
-
-
Save rlankenau/6105632 to your computer and use it in GitHub Desktop.
Pig LoadFunc for RetroSheet data.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@Override | |
public InputFormat getInputFormat() throws IOException { | |
return new RetrosheetInputFormat(); | |
} | |
@Override | |
public Tuple getNext() throws IOException { | |
RetrosheetPlayer[] home_players = new RetrosheetPlayer[11]; | |
RetrosheetPlayer[] away_players = new RetrosheetPlayer[11]; | |
RetrosheetPlayer[] defense = null; | |
Hashtable players = new Hashtable(); | |
int current_outs = 0; | |
int home_score = 0; | |
int away_score = 0; | |
int atbat_of_game = 0; | |
int event_of_game = 0; | |
String last_batter = ""; | |
String current_batter = ""; | |
String runner_on_first = ""; | |
String runner_on_second = ""; | |
String runner_on_third = ""; | |
System.out.println("getNext()"); | |
Tuple game = tupleFactory.newTuple(41); | |
DataBag events = bagFactory.newDefaultBag(); | |
try { | |
if(reader.nextKeyValue()) { | |
LongWritable k = (LongWritable)reader.getCurrentKey(); | |
Text v = (Text)reader.getCurrentValue(); | |
String record = v.toString(); | |
/* Tokenize based on newlines */ | |
for(String line : record.split("\n")) { | |
try { | |
String[] elems = line.split(","); | |
String linetype = elems[0].trim(); | |
if (linetype.equals("com")) { | |
/* Comment. Skip for now */ | |
} else if (linetype.equals("id")) { | |
/* ID record. If we've set the ID, this is an error */ | |
/* Raw game id */ | |
game.set(RetrosheetLoader.GAME_ID, elems[1]); | |
/* We can get the home team from the ID */ | |
game.set(RetrosheetLoader.GAME_HOME_TEAM, elems[1].substring(0,3)); | |
/* Day, Month, Year */ | |
game.set(RetrosheetLoader.GAME_DATE_DAY, Integer.parseInt(elems[1].substring(3,7))); | |
game.set(RetrosheetLoader.GAME_DATE_MONTH, Integer.parseInt(elems[1].substring(7,9))); | |
game.set(RetrosheetLoader.GAME_DATE_YEAR, Integer.parseInt(elems[1].substring(9,11))); | |
int game_of_day = Integer.parseInt(elems[1].substring(11,12)); | |
switch(game_of_day) { | |
case 0: | |
/* First game of the day, not a double header */ | |
game.set(RetrosheetLoader.GAME_OF_DAY, 1); | |
game.set(RetrosheetLoader.GAME_IS_DOUBLE_HEADER, "no"); | |
break; | |
case 1: | |
game.set(RetrosheetLoader.GAME_OF_DAY, 1); | |
game.set(RetrosheetLoader.GAME_IS_DOUBLE_HEADER, "yes"); | |
break; | |
case 2: | |
game.set(RetrosheetLoader.GAME_OF_DAY, 2); | |
game.set(RetrosheetLoader.GAME_IS_DOUBLE_HEADER, "yes"); | |
break; | |
} | |
} else if (linetype.equals("start")) { | |
/* Player start record. Add to the current players list */ | |
try { | |
RetrosheetPlayer p = new RetrosheetPlayer(elems); | |
players.put(p.player_id, p); | |
if(p.home_team) { | |
home_players[p.position] = p; | |
} else { | |
away_players[p.position] = p; | |
} | |
} catch (Exception e) { | |
/*TODO: Log this */ | |
} | |
} else if (linetype.equals("sub")) { | |
/* Player substitution. Replace the player in the list now */ | |
try { | |
RetrosheetPlayer p = new RetrosheetPlayer(elems); | |
players.put(p.player_id, p); | |
if(p.home_team) { | |
home_players[p.position] = p; | |
} else { | |
away_players[p.position] = p; | |
} | |
} catch (Exception e) { | |
/*TODO: Log this */ | |
} | |
} else if (linetype.equals("play")) { | |
try{ | |
/* Play. Emit an event into the events list, update players on base, update score. */ | |
Tuple currentPlay = tupleFactory.newTuple(39); | |
int possible_rbis = 0; | |
/* We can set event of game now. at-bat has to wait until we parse out stolen bases, etc. */ | |
event_of_game++; | |
currentPlay.set(RetrosheetLoader.PLAY_EVENT_OF_GAME, event_of_game); | |
/* Check if the batter has changed. */ | |
current_batter = elems[3].trim(); | |
RetrosheetPlayer current_player = (RetrosheetPlayer)players.get(current_batter); | |
if(current_batter != last_batter) | |
{ | |
atbat_of_game++; | |
} | |
currentPlay.set(RetrosheetLoader.PLAY_ATBAT_OF_GAME, atbat_of_game); | |
/* Set inning and whether it is top or bottom */ | |
currentPlay.set(RetrosheetLoader.PLAY_INNING, Integer.parseInt(elems[1])); | |
if(elems[2].trim() == "0"){ | |
currentPlay.set(RetrosheetLoader.PLAY_INNING_HALF, "top"); | |
defense = away_players; | |
} else { | |
currentPlay.set(RetrosheetLoader.PLAY_INNING_HALF, "bottom"); | |
defense = home_players; | |
} | |
/* Set the fielders */ | |
currentPlay.set(RetrosheetLoader.PLAY_PITCHER, defense[1].player_id); | |
currentPlay.set(RetrosheetLoader.PLAY_CATCHER, defense[2].player_id); | |
currentPlay.set(RetrosheetLoader.PLAY_FIRST_BASEMAN, defense[3].player_id); | |
currentPlay.set(RetrosheetLoader.PLAY_SECOND_BASEMAN, defense[4].player_id); | |
currentPlay.set(RetrosheetLoader.PLAY_THIRD_BASEMAN, defense[5].player_id); | |
currentPlay.set(RetrosheetLoader.PLAY_SHORTSTOP, defense[6].player_id); | |
currentPlay.set(RetrosheetLoader.PLAY_LEFTFIELDER, defense[7].player_id); | |
currentPlay.set(RetrosheetLoader.PLAY_CENTERFIELDER, defense[8].player_id); | |
currentPlay.set(RetrosheetLoader.PLAY_RIGHTFIELDER, defense[9].player_id); | |
if (defense.length >=11 && defense[10] != null && defense[10].player_id != null) | |
currentPlay.set(RetrosheetLoader.PLAY_DESIGNATED_HITTER, defense[10].player_id); | |
else | |
currentPlay.set(RetrosheetLoader.PLAY_DESIGNATED_HITTER, ""); | |
/* Set the runners on base */ | |
currentPlay.set(RetrosheetLoader.PLAY_RUNNER_ON_FIRST, runner_on_first); | |
currentPlay.set(RetrosheetLoader.PLAY_RUNNER_ON_SECOND, runner_on_second); | |
currentPlay.set(RetrosheetLoader.PLAY_RUNNER_ON_THIRD, runner_on_third); | |
int number_on_base = 0; | |
if(!runner_on_first.equals("")) | |
number_on_base++; | |
if(!runner_on_second.equals("")) | |
number_on_base++; | |
if(!runner_on_third.equals("")) | |
number_on_base++; | |
currentPlay.set(RetrosheetLoader.PLAY_RUNNERS_ON_BASE, number_on_base); | |
currentPlay.set(RetrosheetLoader.PLAY_CURRENT_BATTER, current_batter); | |
current_player.at_bat_number++; | |
currentPlay.set(RetrosheetLoader.PLAY_CURRENT_BATTER_AT_BAT, current_player.at_bat_number); | |
currentPlay.set(RetrosheetLoader.PLAY_BATTER_POSITION, current_player.position); | |
try { | |
int count = Integer.parseInt(elems[4]); | |
currentPlay.set(RetrosheetLoader.PLAY_COUNT, count/10 + "-" + count%10); | |
} catch (Exception e) { | |
currentPlay.set(RetrosheetLoader.PLAY_COUNT, "Unknown"); | |
} | |
currentPlay.set(RetrosheetLoader.PLAY_BATTER_HITS_SO_FAR, current_player.hits_so_far); | |
currentPlay.set(RetrosheetLoader.PLAY_BATTER_HBP_SO_FAR, current_player.hbp_so_far); | |
currentPlay.set(RetrosheetLoader.PLAY_BATTER_WALKS_SO_FAR, current_player.walks_so_far); | |
currentPlay.set(RetrosheetLoader.PLAY_BATTER_OUTS_SO_FAR, current_player.outs_so_far); | |
currentPlay.set(RetrosheetLoader.PLAY_PITCHER_BATTERS_PITCHED_TO, defense[1].batters_pitched_to); | |
currentPlay.set(RetrosheetLoader.PLAY_PITCHER_HITS_ALLOWED, defense[1].pitcher_hits_allowed); | |
currentPlay.set(RetrosheetLoader.PLAY_PITCHER_WALKS_ALLOWED, defense[1].pitcher_walks_allowed); | |
currentPlay.set(RetrosheetLoader.PLAY_PITCHER_WILD_PITCHES, defense[1].pitcher_wild_pitches); | |
currentPlay.set(RetrosheetLoader.PLAY_PITCHER_BATTERS_BEANED, defense[1].pitcher_beans); | |
currentPlay.set(RetrosheetLoader.PLAY_PITCHER_STRIKEOUTS, defense[1].pitcher_strikeouts); | |
currentPlay.set(RetrosheetLoader.PLAY_HOME_SCORE, home_score); | |
currentPlay.set(RetrosheetLoader.PLAY_AWAY_SCORE, away_score); | |
/* Parse the event itself */ | |
Matcher m = event_pattern.matcher(elems[6]); | |
if(m.matches() != true) { | |
System.err.println("Couldn't parse event data: " + elems[6]); | |
} else { | |
/* Figure out player movement so we can update everything in order */ | |
if(m.groupCount() >= 6 && m.group(5) != null && !m.group(5).equals("")) { | |
/* We have some player movement */ | |
String[] runner_mvmt = m.group(5).split(";"); | |
/* Scan the whole thing in case the movement is out of order */ | |
for(int i=3;i>0;i--) { | |
for(int j=0;j<runner_mvmt.length;j++) { | |
if(runner_mvmt[j].startsWith(""+i)) { | |
/* Check if this is movement or an out. */ | |
if(runner_mvmt[j].substring(1,2).equals("X")) { | |
/* Clear the runner */ | |
current_outs++; | |
switch(i) { | |
case 1: | |
runner_on_first = ""; | |
break; | |
case 2: | |
runner_on_second = ""; | |
break; | |
case 3: | |
runner_on_third = ""; | |
break; | |
} | |
} else if (runner_mvmt[j].substring(1,2).equals("-")) { | |
String newbasename = runner_mvmt[j].substring(2,3); | |
if(newbasename.equals("H")) { | |
/* Can't credit an RBI yet. Save as conditional RBI */ | |
possible_rbis++; | |
if(current_player.home_team) { | |
home_score++; | |
} else { | |
away_score++; | |
} | |
switch(i) { | |
case 1: | |
runner_on_first = ""; | |
break; | |
case 2: | |
runner_on_second = ""; | |
break; | |
case 3: | |
runner_on_third = ""; | |
break; | |
} | |
} else { | |
int newbase = Integer.parseInt(runner_mvmt[j].substring(2,3)); | |
String moving_runner = ""; | |
switch(i) { | |
case 1: | |
moving_runner = runner_on_first; | |
runner_on_first = ""; | |
break; | |
case 2: | |
moving_runner = runner_on_second; | |
runner_on_second = ""; | |
break; | |
case 3: | |
moving_runner = runner_on_third; | |
runner_on_third = ""; | |
break; | |
} | |
switch(newbase) { | |
case 1: | |
runner_on_first = moving_runner; | |
break; | |
case 2: | |
runner_on_second = moving_runner; | |
break; | |
case 3: | |
runner_on_second = moving_runner; | |
break; | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
currentPlay.set(RetrosheetLoader.PLAY_BATTER_RBIS, current_player.rbis); | |
if(m.group(1) != null) { | |
if(m.group(1).equals("S")) { | |
runner_on_first = current_batter; | |
defense[1].pitcher_hits_allowed++; | |
current_player.rbis+=possible_rbis; | |
currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Single"); | |
} else if (m.group(1).equals("D") || m.group(1).equals("DGR")) { | |
runner_on_second = current_batter; | |
defense[1].pitcher_hits_allowed++; | |
current_player.rbis+=possible_rbis; | |
currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Double"); | |
} else if (m.group(1).equals("T")) { | |
runner_on_third = current_batter; | |
defense[1].pitcher_hits_allowed++; | |
current_player.rbis+=possible_rbis; | |
currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Triple"); | |
} else if (m.group(1).equals("HR")) { | |
if(current_player.home_team) { | |
home_score++; | |
} else { | |
away_score++; | |
} | |
defense[1].pitcher_hits_allowed++; | |
current_player.rbis+=possible_rbis; | |
currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Home run"); | |
} else if (m.group(1).equals("HP")) { | |
runner_on_first = current_batter; | |
defense[1].pitcher_beans++; | |
current_player.hbp_so_far++; | |
currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Hit by pitch"); | |
} else if (m.group(1).equals("WP")) { | |
defense[1].pitcher_wild_pitches++; | |
currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Wild pitch"); | |
} else if (m.group(1).equals("W") || m.group(1).equals("IW")) { | |
runner_on_first = current_batter; | |
defense[1].pitcher_walks_allowed++; | |
currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Walk"); | |
} else if (m.group(1).equals("K")) { | |
defense[1].pitcher_strikeouts++; | |
current_player.strikeouts_so_far++; | |
currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Strikeout"); | |
} else if (m.group(1).equals("NP")) { | |
currentPlay.set(RetrosheetLoader.PLAY_RESULT, "No Play"); | |
} else if (m.group(1).equals("")) { | |
/* Out */ | |
current_player.outs_so_far++; | |
currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Out"); | |
} | |
if (m.group(2) != null) | |
// This is fielder - 1-9 | |
currentPlay.set(RetrosheetLoader.PLAY_FIELDER, m.group(2)); | |
else | |
currentPlay.set(RetrosheetLoader.PLAY_FIELDER,"0"); | |
if (m.group(3) != null) { | |
// This is type of ball hit: | |
// "L" line drive, "G" grounder,etc | |
switch(m.group(3).charAt(0)) { | |
case 'L': currentPlay.set(PLAY_TRAJECTORY,"Line drive"); break; | |
case 'F': currentPlay.set(PLAY_TRAJECTORY,"Fly ball"); break; | |
case 'G': currentPlay.set(PLAY_TRAJECTORY,"Grounder"); break; | |
case 'P': currentPlay.set(PLAY_TRAJECTORY,"Pop fly"); break; | |
case 'B': currentPlay.set(PLAY_TRAJECTORY,"Bunt"); break; | |
default: currentPlay.set(PLAY_TRAJECTORY,m.group(3)); | |
} | |
} | |
else | |
currentPlay.set(PLAY_TRAJECTORY,"0"); | |
} else { | |
/* Out */ | |
current_player.outs_so_far++; | |
currentPlay.set(RetrosheetLoader.PLAY_RESULT, "Out"); | |
} | |
/* Write out rbis and rbis_so_far. */ | |
currentPlay.set(RetrosheetLoader.PLAY_RBIS_ON_PLAY, possible_rbis); | |
} | |
events.add(currentPlay); | |
} catch (Exception e) { | |
System.err.println("Error with play: " + e); | |
e.printStackTrace(); | |
} | |
} else if (linetype.equals("version")) { | |
/* File version info. Skip for now */ | |
} else if (linetype.equals("info")) { | |
/* Game info. Add to the output tuple */ | |
String infotype = elems[1].trim(); | |
if (infotype.equals("hometeam")) { | |
/* Ignore, this is already set by ID */ | |
} else if (infotype.equals("site")) { | |
game.set(RetrosheetLoader.GAME_SITE, elems[2]); | |
} else if (infotype.equals("date")) { | |
/* Already set in the ID */ | |
} else if (infotype.equals("number")) { | |
/* Already set in the ID */ | |
} else if (infotype.equals("daynight")) { | |
game.set(RetrosheetLoader.GAME_DAY_NIGHT, elems[2]); | |
} else if (infotype.equals("starttime")) { | |
String[] time_elems = elems[2].split(":"); | |
int hour = 0, minutes = 0; | |
if(time_elems.length == 2) { | |
hour = Integer.parseInt(time_elems[0]); | |
minutes = Integer.parseInt(time_elems[1].substring(0,2)); | |
if(!time_elems[1].substring(2,4).equals("AM")) { | |
hour+=12; | |
} | |
} else { | |
if(elems[2].length() > 2) { | |
int length = elems[2].length(); | |
minutes = Integer.parseInt(elems[2].substring(length-2, length)); | |
hour = Integer.parseInt(elems[2].substring(0, length-2)); | |
} | |
} | |
game.set(RetrosheetLoader.GAME_START_HOUR, hour); | |
game.set(RetrosheetLoader.GAME_START_MINUTES, minutes); | |
} else if( infotype.equals("visteam")){ | |
game.set(RetrosheetLoader.GAME_AWAY_TEAM, elems[2]); | |
} else if (infotype.equals("usedh")) { | |
game.set(RetrosheetLoader.GAME_USE_DESIGNATED_HITTER, elems[2]); | |
} else if (infotype.equals("umphome")) { | |
game.set(RetrosheetLoader.GAME_HOME_UMPIRE, elems[2]); | |
} else if (infotype.equals("ump1b")) { | |
game.set(RetrosheetLoader.GAME_1ST_BASE_UMPIRE, elems[2]); | |
} else if (infotype.equals("ump2b")) { | |
game.set(RetrosheetLoader.GAME_2ND_BASE_UMPIRE, elems[2]); | |
} else if (infotype.equals("ump3b")) { | |
game.set(RetrosheetLoader.GAME_3RD_BASE_UMPIRE, elems[2]); | |
} else if (infotype.equals("umplf")) { | |
game.set(RetrosheetLoader.GAME_LEFT_FIELD_UMPIRE, elems[2]); | |
} else if (infotype.equals("umprf")) { | |
game.set(RetrosheetLoader.GAME_RIGHT_FIELD_UMPIRE, elems[2]); | |
} else if (infotype.equals("wp")) { | |
game.set(RetrosheetLoader.GAME_WINNING_PITCHER, elems[2]); | |
} else if (infotype.equals("lp")) { | |
game.set(RetrosheetLoader.GAME_LOSING_PITCHER, elems[2]); | |
} else if (infotype.equals("howscored")) { | |
game.set(RetrosheetLoader.GAME_HOW_SCORED, elems[2]); | |
} else if (infotype.equals("scorer")) { | |
game.set(RetrosheetLoader.GAME_SCORER, elems[2]); | |
} else if (infotype.equals("inputter")) { | |
game.set(RetrosheetLoader.GAME_INPUTTER, elems[2]); | |
} else if (infotype.equals("translator")) { | |
game.set(RetrosheetLoader.GAME_TRANSLATOR, elems[2]); | |
} else if (infotype.equals("pitches")) { | |
game.set(RetrosheetLoader.GAME_HAS_PITCHES, elems[2]); | |
} else if (infotype.equals("winddir")) { | |
game.set(RetrosheetLoader.GAME_WIND_DIRECTION, elems[2]); | |
} else if (infotype.equals("windspeed")) { | |
game.set(RetrosheetLoader.GAME_WIND_SPEED, Integer.parseInt(elems[2])); | |
} else if (infotype.equals("temp")) { | |
game.set(RetrosheetLoader.GAME_TEMPERATURE, Integer.parseInt(elems[2])); | |
} else if (infotype.equals("sky")) { | |
game.set(RetrosheetLoader.GAME_SKY_CONDITION, elems[2]); | |
} else if (infotype.equals("fieldcond")) { | |
game.set(RetrosheetLoader.GAME_FIELD_CONDITION, elems[2]); | |
} else if (infotype.equals("precip")) { | |
game.set(RetrosheetLoader.GAME_PRECIPITATION, elems[2]); | |
} else if (infotype.equals("attendance")) { | |
game.set(RetrosheetLoader.GAME_ATTENDANCE, Integer.parseInt(elems[2])); | |
} else if (infotype.equals("timeofgame")) { | |
game.set(RetrosheetLoader.GAME_DURATION, Integer.parseInt(elems[2])); | |
} else if (infotype.equals("save")) { | |
game.set(RetrosheetLoader.GAME_COUNTED_AS_SAVE, elems[2]); | |
} | |
} else if (linetype.equals("data")) { | |
/* Other game data. Generally earned runs for the pitchers */ | |
} | |
} catch (Exception e) { | |
System.err.println("Malformed data: '" + line + "' exception: " + e); | |
} | |
} | |
game.set(RetrosheetLoader.GAME_EVENTS, events); | |
game.set(RetrosheetLoader.GAME_FINAL_HOME_SCORE, home_score); | |
game.set(RetrosheetLoader.GAME_FINAL_AWAY_SCORE, away_score); | |
game.set(RetrosheetLoader.GAME_EVENTS_IN_GAME, event_of_game); | |
game.set(RetrosheetLoader.GAME_BATTERS_IN_GAME, atbat_of_game); | |
if(home_score>away_score) { | |
game.set(RetrosheetLoader.GAME_WINNER, game.get(RetrosheetLoader.GAME_HOME_TEAM)); | |
} else if (away_score>home_score) { | |
game.set(RetrosheetLoader.GAME_WINNER, game.get(RetrosheetLoader.GAME_AWAY_TEAM)); | |
} | |
return game; | |
} | |
} catch (Exception e) { | |
/*TODO: Log this */ | |
throw new IOException("Error parsing", e); | |
} | |
return null; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment