albert-decatur · August 29, 2015 14:24
diff --git a/A magic_stopword_removal.sh b/A magic_stopword_removal.sh
 # remove a stopword list, built from 1-gram frequency and lack of vowels, from tiger.csv
 # first we get a list of stopwords: the most common 1-grams that are short strings or have no lowercase vowel
 # then we build one sed substitution per stopword, using word boundaries (eg \bRd\b)
 # and we ask for a case-insensitive match (eg s/pattern//I)
 # regex metacharacters and the / delimiter inside a stopword are backslash-escaped first,
 # so a unigram such as "St." or "1/2" is matched literally instead of breaking the command
 # then we hand all the substitutions to a single sed as one script (sed -f)
 # advantage over sed OR (eg sed 's/Rd\|St//I') is that that can only do so many at a time
 # advantage over eval'ing a generated "sed ... | sed ..." pipeline: one process however long the list is,
 # no data-derived text is ever run through the shell, and an empty list passes the file through untouched
 # NB: there is no g flag, so each stopword is removed at most once per line
 # NB: \b and the I flag are GNU sed extensions; ngrams, sortfreq and tawk are not defined in this file

 # rm_stopwords holds the sed script: one s/\bWORD\b//I command per line
 rm_stopwords=$(
    # unigrams of this length or shorter count as stopwords
    n=6
    ngrams tiger.csv 1 |
    sortfreq |
    # drop the first line of the frequency table
    sed '1d' |
    # keep rows whose 2nd field is short or has no lowercase ASCII vowel
    awk -v n="$n" 'length($2) <= n || $2 ~ /^[^aeiou]*$/' |
    head -n 35 |
    tawk '{print $2}' |
    # escape ] [ \ / . * ^ $ in the word, then wrap it as s/\bWORD\b//I
    sed -e 's:[][\\/.*^$]:\\&:g' -e 's:^:s/\\b:' -e 's:$:\\b//I:'
 )
 # the script is passed through process substitution, so its length is not bound by the argument-size limit
 sed -f <(printf '%s\n' "$rm_stopwords") tiger.csv
diff --git a/B 3types_of_stopwords.sh b/B 3types_of_stopwords.sh
 # make three distinct lists of stopwords out of tiger road fullname unigrams
 # NB: each list is drawn only from the unigrams the previous lists left behind
 # NB: uses dotfiles for data science (ngrams, sortfreq, c are not defined here), plus mawk and sponge
 # 1 too short or includes number -> stop_shortOrNum
 # 2 has no vowels -> stop_noVowel
 # 3 top _n_ most common whole words (not too short, has vowel, doesn't include number) -> stop_wholeWords
 # NB: be generous with the whole words list as you will want to curate it by hand at least a little to avoid making meaningful stuff into a stopword

 # take this many of the most common whole words
 ntop_wholeWords=400
 # consider any unigram shorter than this to be a stopword
 minlength=4
 # this is our temporary unigram list - we will progressively shorten this so that one type of list does not bleed into another type
 unigrams=$(mktemp) || exit 1
 # just a temp file to store the street names - should rewrite ngram function to take STDIN actually
 tmp=$(mktemp) || { rm -f -- "$unigrams"; exit 1; }
 # both temp files are removed however the script ends
 trap 'rm -f -- "$unigrams" "$tmp"' EXIT

 # drop_from_unigrams LIST: delete every line of LIST from the $unigrams file, in place
 # -x makes a list line match only an identical whole line; plain grep -F matches substrings,
 # so a list line like "7 St" would also have deleted "17 Stone"
 # an empty LIST is skipped, because a blank pattern matches (and would delete) every line
 drop_from_unigrams() {
   [[ -n "$1" ]] || return 0
   grep -vxFf <(printf '%s\n' "$1") "$unigrams" | sponge "$unigrams"
 }

 # make that list of street names (c 2 presumably keeps column 2 - confirm against the dotfiles; sed drops the first line)
 c 2 < fullnames_uniq.txt | sed '1d' > "$tmp"
 # get a unique list of unigrams with counts of frequency
 ngrams "$tmp" 1 | sortfreq > "$unigrams"
 # start building our 3 types of stopwords lists
 # first up is too short or contains a number (the test is on the 2nd field; whole rows are kept)
 stopwords_shortOrNum=$( mawk -v minlength="$minlength" '$2 ~ /[0-9]/ || length($2) < minlength' < "$unigrams" )
 # save these stopwords to file
 printf '%s\n' "$stopwords_shortOrNum" | c 2 > stop_shortOrNum
 # remove these matches from the unique frequency unigram list
 drop_from_unigrams "$stopwords_shortOrNum"
 # next up! make a stopword list when unigrams do not have an ASCII vowel
 # NOTE(review): only lowercase aeiouy are tested, so an all-uppercase unigram counts as vowel-less - confirm the input case
 stopwords_noVowel=$( mawk '$2 ~ /^[^aeiouy]*$/' < "$unigrams" )
 # write that list to file
 printf '%s\n' "$stopwords_noVowel" | c 2 > stop_noVowel
 # remove it from the unique unigram freq list
 drop_from_unigrams "$stopwords_noVowel"
 # now get a list of the _n_ top unigrams that are left by frequency
 # be generous with the length of this list, as you may have to remove potential stopwords from it by hand
 # whole words are gold
 # the first remaining line is skipped (presumably the header row written by sortfreq - confirm)
 sed '1d' "$unigrams" | head -n "$ntop_wholeWords" | c 2 > stop_wholeWords
diff --git a/C eval arbitrary number of commands using GNU parallel b/C eval arbitrary number of commands using GNU parallel
 # remove every stopword listed in stopwords/* from /tmp/streets, in batches, using GNU parallel
 # 1. copy the input to /tmp/1
 # 2. turn each line of stopwords/* into   sed "s/\bWORD\b//I"|\   (one sed command per stopword, ending in a pipe and a line continuation)
 # 3. feed that list to parallel: --pipe -N1000 gives each job up to 1000 of those lines on stdin, -j1 runs the jobs one after another
 # 4. each job strips the trailing |\ from the last line of its batch, evals the resulting sed pipeline
 #    reading /tmp/{#} and writing /tmp/({#} + 1), then deletes /tmp/{#}    ({#} is parallel's job sequence number)
 # the result is left in /tmp/<number of batches + 1>; /tmp/streets itself is not modified
 # NOTE(review): stopwords are spliced unescaped into a double-quoted sed script that is later eval'd -
 #   a stopword containing / " $ or a backtick will break or alter the generated command
 # NOTE(review): /tmp/1, /tmp/2, ... are fixed, predictable names and will clobber existing files of the same name
 in=/tmp/streets; cp $in /tmp/1; rm_all_stopwords=$( cat stopwords/* | sed 's:^:sed "s/\\b:g;s:$:\\b//I":g' | sed 's:$:\|\\:g' ); echo "$rm_all_stopwords" | parallel -j1 --pipe -N1000 'rm_stop=$( cat | sed "$ s:|\\\\$::g" ); cat /tmp/{#} | eval "$rm_stop" > /tmp/$(expr {#} + 1); rm /tmp/{#}'
	# remove a stopword list, built from 1-gram frequency and lack of vowels, from tiger.csv
	# first we get a list of stopwords: the most common 1-grams that are short strings or have no lowercase vowel
	# then we build one sed substitution per stopword, using word boundaries (eg \bRd\b)
	# and we ask for a case-insensitive match (eg s/pattern//I)
	# regex metacharacters and the / delimiter inside a stopword are backslash-escaped first,
	# so a unigram such as "St." or "1/2" is matched literally instead of breaking the command
	# then we hand all the substitutions to a single sed as one script (sed -f)
	# advantage over sed OR (eg sed 's/Rd\|St//I') is that that can only do so many at a time
	# advantage over eval'ing a generated "sed ... | sed ..." pipeline: one process however long the list is,
	# no data-derived text is ever run through the shell, and an empty list passes the file through untouched
	# NB: there is no g flag, so each stopword is removed at most once per line
	# NB: \b and the I flag are GNU sed extensions; ngrams, sortfreq and tawk are not defined in this file
	# NB: pipes are written as a plain | here - a backslash-escaped \| is passed to the command as a literal argument

	# rm_stopwords holds the sed script: one s/\bWORD\b//I command per line
	rm_stopwords=$(
		# unigrams of this length or shorter count as stopwords
		n=6
		ngrams tiger.csv 1 |
		sortfreq |
		# drop the first line of the frequency table
		sed '1d' |
		# keep rows whose 2nd field is short or has no lowercase ASCII vowel
		awk -v n="$n" 'length($2) <= n || $2 ~ /^[^aeiou]*$/' |
		head -n 35 |
		tawk '{print $2}' |
		# escape ] [ \ / . * ^ $ in the word, then wrap it as s/\bWORD\b//I
		sed -e 's:[][\\/.*^$]:\\&:g' -e 's:^:s/\\b:' -e 's:$:\\b//I:'
	)
	# the script is passed through process substitution, so its length is not bound by the argument-size limit
	sed -f <(printf '%s\n' "$rm_stopwords") tiger.csv
	# make three distinct lists of stopwords out of tiger road fullname unigrams
	# NB: each list is drawn only from the unigrams the previous lists left behind
	# NB: uses dotfiles for data science (ngrams, sortfreq, c are not defined here), plus mawk and sponge
	# NB: pipes are written as a plain | here - a backslash-escaped \| is passed to the command as a literal argument
	# 1 too short or includes number -> stop_shortOrNum
	# 2 has no vowels -> stop_noVowel
	# 3 top _n_ most common whole words (not too short, has vowel, doesn't include number) -> stop_wholeWords
	# NB: be generous with the whole words list as you will want to curate it by hand at least a little to avoid making meaningful stuff into a stopword

	# take this many of the most common whole words
	ntop_wholeWords=400
	# consider any unigram shorter than this to be a stopword
	minlength=4
	# this is our temporary unigram list - we will progressively shorten this so that one type of list does not bleed into another type
	unigrams=$(mktemp) || exit 1
	# just a temp file to store the street names - should rewrite ngram function to take STDIN actually
	tmp=$(mktemp) || { rm -f -- "$unigrams"; exit 1; }
	# both temp files are removed however the script ends
	trap 'rm -f -- "$unigrams" "$tmp"' EXIT

	# drop_from_unigrams LIST: delete every line of LIST from the $unigrams file, in place
	# -x makes a list line match only an identical whole line; plain grep -F matches substrings,
	# so a list line like "7 St" would also have deleted "17 Stone"
	# an empty LIST is skipped, because a blank pattern matches (and would delete) every line
	drop_from_unigrams() {
		[[ -n "$1" ]] || return 0
		grep -vxFf <(printf '%s\n' "$1") "$unigrams" | sponge "$unigrams"
	}

	# make that list of street names (c 2 presumably keeps column 2 - confirm against the dotfiles; sed drops the first line)
	c 2 < fullnames_uniq.txt | sed '1d' > "$tmp"
	# get a unique list of unigrams with counts of frequency
	ngrams "$tmp" 1 | sortfreq > "$unigrams"
	# start building our 3 types of stopwords lists
	# first up is too short or contains a number (the test is on the 2nd field; whole rows are kept)
	stopwords_shortOrNum=$( mawk -v minlength="$minlength" '$2 ~ /[0-9]/ || length($2) < minlength' < "$unigrams" )
	# save these stopwords to file
	printf '%s\n' "$stopwords_shortOrNum" | c 2 > stop_shortOrNum
	# remove these matches from the unique frequency unigram list
	drop_from_unigrams "$stopwords_shortOrNum"
	# next up! make a stopword list when unigrams do not have an ASCII vowel
	# NOTE(review): only lowercase aeiouy are tested, so an all-uppercase unigram counts as vowel-less - confirm the input case
	stopwords_noVowel=$( mawk '$2 ~ /^[^aeiouy]*$/' < "$unigrams" )
	# write that list to file
	printf '%s\n' "$stopwords_noVowel" | c 2 > stop_noVowel
	# remove it from the unique unigram freq list
	drop_from_unigrams "$stopwords_noVowel"
	# now get a list of the _n_ top unigrams that are left by frequency
	# be generous with the length of this list, as you may have to remove potential stopwords from it by hand
	# whole words are gold
	# the first remaining line is skipped (presumably the header row written by sortfreq - confirm)
	sed '1d' "$unigrams" | head -n "$ntop_wholeWords" | c 2 > stop_wholeWords