Created
August 17, 2018 18:09
-
-
Save Goblin80/2759636c76b0072c80e75e4275adef1b to your computer and use it in GitHub Desktop.
MapReduce program to calculate the conditional probability that a word q occurs after another word p
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Run the bigram conditional-probability job with Hadoop Streaming and print
# the first ten output records.
#
# Run this from the directory that contains mapper.py, reducer.py and input/.

# Abort on the first failing step (or on an unset variable) instead of
# submitting a job against a half-prepared HDFS directory.
set -eu

STREAMING_JAR=/usr/lib/hadoop-mapreduce/hadoop-streaming-2.6.0-cdh5.13.0.jar

# -f keeps the very first run from failing when the directory does not exist yet.
hadoop fs -rm -r -f shakespeare
hadoop fs -mkdir -p shakespeare/input
hadoop fs -put input/* shakespeare/input

# Generic options (-files, -D) have to come before the streaming options.
#   -files  ships both scripts into each task's working directory, so the job
#           does not depend on the submitting machine's absolute path existing
#           on the task nodes.
#   -D mapreduce.job.reduces=1
#           the reducer divides every "p@q" count by the count of "p", so all
#           keys must reach the same reducer process.
hadoop jar "$STREAMING_JAR" \
    -files mapper.py,reducer.py \
    -D mapreduce.job.reduces=1 \
    -input shakespeare/input \
    -output shakespeare/output \
    -mapper "python mapper.py" \
    -reducer "python reducer.py"

# Quote the glob so it is expanded against HDFS, not the local directory.
hadoop fs -cat 'shakespeare/output/part*' | head -n 10
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Hadoop Streaming mapper: emit unigram and bigram counts for a text stream.

For every word read from stdin the mapper writes ``word<TAB>1`` and, when a
previous word exists, ``previous@word<TAB>1``.  Written to run unchanged on
both Python 2 and Python 3.
"""
import sys
import re

# Separator between the two words of a bigram key.  '@' can never be part of
# a word because the word pattern below only matches \w characters.
divider = '@'

# A word is a run of at least three word characters; shorter tokens are
# dropped.  Compiled once instead of being looked up for every input line.
WORD_RE = re.compile(r'\w{3,}')


def format_records(a, b):
    """Return the output records for word *b* preceded by word *a*.

    The unigram record for *b* always comes first.  The bigram record
    ``a@b`` follows only when *a* is truthy (i.e. *b* is not the first word).
    """
    records = ['%s\t%d' % (b, 1)]
    if a:
        records.append('%s%s%s\t%d' % (a, divider, b, 1))
    return records


def display(a, b):
    """Print the records for word *b* preceded by *a* to stdout."""
    for record in format_records(a, b):
        # Single parenthesised argument: valid print on Python 2 and 3 alike.
        print(record)


def word_pairs(lines):
    """Yield ``(previous, current)`` lower-cased word pairs from *lines*.

    The previous word is carried across line boundaries, so the last word of
    one line pairs with the first word of the next.  The first word of the
    stream is paired with ``None``.
    """
    previous = None
    for line in lines:
        for word in WORD_RE.findall(line):
            current = word.lower()
            yield previous, current
            previous = current


def main():
    """Read text from stdin and print the count records."""
    for previous, current in word_pairs(sys.stdin):
        display(previous, current)


# Guarded so that importing the module does not start consuming stdin.
if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Hadoop Streaming reducer: turn unigram/bigram counts into P(q | p).

Reads ``key<TAB>count`` records from stdin, where a key is either a word
``p`` or a bigram ``p@q``, and prints ``p@q<TAB>count(p@q)/count(p)`` sorted
by descending probability.  Written to run unchanged on Python 2 and 3.
"""
import sys

# Separator between the two words of a bigram key.
divider = '@'


def count_keys(lines):
    """Sum the counts of every key found in *lines*.

    Each non-blank line must look like ``key<TAB>integer``.  Blank lines are
    skipped.  Returns a dict mapping key -> total count.

    Raises:
        ValueError: if a line has no tab or its count is not an integer.
    """
    freq = {}
    for line in lines:
        if not line.strip():
            continue
        # int() tolerates the trailing newline left on the count field.
        key, count = line.rsplit('\t', 1)
        freq[key] = freq.get(key, 0) + int(count)
    return freq


def conditional_probabilities(freq):
    """Return ``[(bigram_key, probability), ...]`` for the bigrams in *freq*.

    The probability of ``p@q`` is ``freq['p@q'] / freq['p']``.  The result is
    ordered by descending probability; ties are broken by key so the output
    is the same on every run.

    Raises:
        KeyError: if the count of a bigram's first word is missing from
            *freq*.  NOTE(review): this presumably happens when keys are
            spread over several reducers -- run the job with one reducer.
    """
    bigram = {}
    for key in freq:
        if divider in key:
            left, _right = key.split(divider)
            # float() forces true division on Python 2 as well.
            bigram[key] = freq[key] / float(freq[left])
    return sorted(bigram.items(), key=lambda item: (-item[1], item[0]))


def main():
    """Read count records from stdin and print the bigram probabilities."""
    for item in conditional_probabilities(count_keys(sys.stdin)):
        # Single parenthesised argument: valid print on Python 2 and 3 alike.
        print('%s\t%f' % item)


# Guarded so that importing the module does not start consuming stdin.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment