Last active
August 29, 2015 14:24
-
-
Save albert-decatur/5636028ef407fed7f78b to your computer and use it in GitHub Desktop.
magic stopword removal from road names
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove a data-derived stopword list from tiger.csv by generating one sed
# command per stopword, chaining them with pipes, and eval-ing the pipeline.
#
# How the stopwords are picked:
#   1. take the 1-grams of tiger.csv in `sortfreq` order (most common first)
#   2. keep those whose text (field 2) is at most $n characters long, or that
#      contain no lowercase ASCII vowel
#   3. keep the first $top of them
#
# Each stopword WORD becomes   sed "s/\bWORD\b//I"
#   \b ... \b  -> match on word boundaries only (GNU sed)
#   I          -> case-insensitive match (GNU sed)
# There is no `g` flag, so only the first occurrence on each line is removed.
#
# Why a chain of sed processes instead of a single alternation such as
# sed 's/Rd\|St//I': the alternation form can only take so many words at a
# time. Batching the words into groups of *n* per sed would be more efficient.
#
# External helpers (not defined in this file): ngrams, sortfreq, tawk.
# NOTE(review): stopwords are pasted verbatim into a string that is eval-ed;
# a word containing '/', '"', '$' or a backtick will break the command or be
# executed by the shell. Only run this on trusted data.
rm_stopwords=$(
  n=6     # 1-grams of this length or shorter are stopword candidates
  top=35  # number of candidates to keep
  ngrams tiger.csv 1 \
    | sortfreq \
    | sed '1d' \
    | awk -v n="$n" 'length($2) <= n || $2 ~ /^[^aeiou]*$/' \
    | head -n "$top" \
    | tawk '{print $2}' \
    | sed 's:^:sed "s/\\b:;s:$:\\b//I":' \
    | tr '\n' '|' \
    | sed 's:|$::'
)

# Run the generated pipeline over the file. With no stopwords there is
# nothing to strip, so pass the file through untouched instead of eval-ing an
# empty command (which would silently produce no output at all).
if [[ -n "$rm_stopwords" ]]; then
  eval "$rm_stopwords" < tiger.csv
else
  cat tiger.csv
fi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build three disjoint stopword lists from the unigrams of TIGER road
# full names. Each list is drawn only from what the previous lists left behind:
#   1. stop_shortOrNum  - unigram is too short or contains a digit
#   2. stop_noVowel     - unigram has none of the lowercase letters a e i o u y
#   3. stop_wholeWords  - top _n_ most common of the remaining unigrams
# NB: be generous with the whole-words list; you will want to curate it by
#     hand so that meaningful words do not end up as stopwords.
# NB: relies on "dotfiles for data science" helpers that are not defined in
#     this file (ngrams, sortfreq, c) plus mawk and moreutils' sponge.
#     `c 2` presumably selects column 2 - confirm against the dotfiles.

# take this many of the most common whole words
ntop_wholeWords=400
# consider any unigram shorter than this to be a stopword
minlength=4

# Working unigram list; it is shortened after every stage so that one kind of
# stopword does not bleed into the next list.
unigrams=$(mktemp)
# Holds the street names on disk because ngrams takes a file argument
# (ngrams should really be rewritten to read STDIN).
tmp=$(mktemp)
# Do not leave the temp files behind when the script exits.
trap 'rm -f -- "$unigrams" "$tmp"' EXIT

# street names: column 2 of the input, minus its first line
c 2 < fullnames_uniq.txt | sed '1d' > "$tmp"
# unique unigrams with their frequency counts
ngrams "$tmp" 1 | sortfreq > "$unigrams"

# --- list 1: too short, or contains a digit --------------------------------
stopwords_shortOrNum=$(
  mawk -v minlength="$minlength" \
    '$2 ~ /[0-9]/ || length($2) < minlength' "$unigrams"
)
printf '%s\n' "$stopwords_shortOrNum" | c 2 > stop_shortOrNum
# Drop exactly those lines from the working list. -x makes every pattern
# match a whole line: without it "5<TAB>St" would also delete
# "15<TAB>Street", and an empty list (one empty pattern) would delete every
# line in the file.
grep -vxFf <(printf '%s\n' "$stopwords_shortOrNum") "$unigrams" \
  | sponge "$unigrams"

# --- list 2: no ASCII vowel --------------------------------------------------
stopwords_noVowel=$(mawk '$2 ~ /^[^aeiouy]*$/' "$unigrams")
printf '%s\n' "$stopwords_noVowel" | c 2 > stop_noVowel
grep -vxFf <(printf '%s\n' "$stopwords_noVowel") "$unigrams" \
  | sponge "$unigrams"

# --- list 3: most common whole words ----------------------------------------
# Skip the first remaining line, then keep the next $ntop_wholeWords entries.
# Whole words are gold: review this list by hand before using it.
# NOTE(review): the first line is presumably a header or junk row emitted by
# sortfreq - confirm, otherwise the most frequent word is being discarded.
sed '1d' "$unigrams" | head -n "$ntop_wholeWords" | c 2 > stop_wholeWords
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove every stopword listed in stopwords/* from /tmp/streets, 1000 sed
# commands at a time.
#
# The data is passed along a chain of numbered files: batch k reads /tmp/k,
# writes /tmp/(k+1) and deletes /tmp/k. After the last batch the cleaned
# street list is the only file left in the chain, at /tmp/<batches + 1>.
# NOTE(review): /tmp/1, /tmp/2, ... are predictable names in a shared
# directory; kept as-is so the result stays where existing users look for it.
in=/tmp/streets
cp -- "$in" /tmp/1

# One line per stopword:   sed "s/\bWORD\b//I"|\
# The trailing "|\" (pipe + line continuation) lets consecutive lines be
# eval-ed as one long pipeline.
# NOTE(review): stopwords are pasted verbatim into shell code that is later
# eval-ed; only use trusted stopword files.
rm_all_stopwords=$(
  cat stopwords/* \
    | sed 's:^:sed "s/\\b:;s:$:\\b//I":' \
    | sed 's:$:|\\:'
)

# Work done for each batch that GNU parallel feeds on stdin:
#   1. read the batch and strip the "|\" from its last line so the pipeline
#      is complete
#   2. run the pipeline from /tmp/{#} into /tmp/({#} + 1)
#      ({#} is parallel's job sequence number)
#   3. remove the consumed input file
job='rm_stop=$(sed "$ s:|\\\\$::g"); '
job+='eval "$rm_stop" < /tmp/{#} > /tmp/$(( {#} + 1 )); '
job+='rm /tmp/{#}'

# -j1 keeps the batches strictly sequential, which the file chain requires;
# --pipe -N1000 hands each job 1000 lines (sed commands) on stdin.
printf '%s\n' "$rm_all_stopwords" | parallel -j1 --pipe -N1000 "$job"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment