Albert Decatur albert-decatur

crouton tips for chromebook

# create the chroot under any directory you want, e.g. on removable media
sudo sh ~/Downloads/crouton -r trusty -t x11 -p "/media/removable/USB Drive/crouton/chroots/"

# enter the chroot that lives on removable media
sudo enter-chroot -c "/media/removable/USB Drive/crouton/chroots/"

# identifying stopwords using ngrams and vowels

we have: street names in Richmond, VA
we want to: match street names to Confederate generals and Civil Rights leaders
first we must: remove small pesky elements of names that are irrelevant to the match

the solution: ngrams to build a stopword list!

prerequisites

simplify shapefile

convert to geojson

remove null features

prerequisites

the incomparable mapshaper
jq, your favorite JSON filter

	#!/bin/bash

	# bootstrap a vagrant ubuntu guest to have the following:
	# openssh server
	# x2go server
	# ufw
	# fail2ban
	# user args: STDIN is TSV of "username\tpubkey", one per line
	# NB:
	# run as root

	<?php
	// Print the syllable count TextStatistics reports for everything read from STDIN.
	// Library: https://github.com/DaveChild/Text-Statistics
	// Pull in every PHP source file of the library before calling into it.
	$libraryFiles = glob("DaveChild/TextStatistics/*.php");
	foreach ($libraryFiles as $libraryFile) {
		include $libraryFile;
	}
	// Read all of standard input as one string and hand it to the counter.
	$input = file_get_contents("php://stdin");
	echo DaveChild\TextStatistics\Syllables::syllableCount($input);
	?>

	# Insert your preferred key mappings here.
	# Drop whatever d is currently bound to, and page down with Ctrl-f.
	unmap d
	map <c-f> scrollPageDown
	# Drop whatever u is currently bound to, and page up with Ctrl-b.
	unmap u
	map <c-b> scrollPageUp
	# Reuse d for closing the current tab.
	# NOTE(review): d was already unmapped above; this second unmap looks redundant.
	unmap d
	map d removeTab
	# Reuse u for restoring a closed tab.
	# NOTE(review): u was already unmapped above; this second unmap looks redundant.
	unmap u
	map u restoreTab

	# for every agency named in /tmp/agencies (one per line), get name,isodate,cost
	# cost is just units*unit price, i.e. the product of fields 5 and 6
	# ignore cost if it's <1 USD
	# NOTE(review): tawk is not defined in this snippet - presumably a tab-delimited awk wrapper; confirm.
	in=eVA_healthyRecords_2015-07-12.tsv
	while read -r agency; do
		# Field 2 carries the double-quoted agency name. Fields 5 and 6 are
		# multiplied; without the * awk would concatenate them as strings.
		# The first sed strips the double quotes; the second rewrites a
		# tab-delimited NN/NN/NNNN date as NNNN-NN-NN.
		cat "$in" \
			| tawk "{if(\$2 ~ /\"${agency}\"/ && \$5*\$6 > 1)print \$2,\$3,\$6*\$5}" \
			| sed 's:"::g' \
			| sed 's:\t\([0-9]\{2\}\)\/\([0-9]\{2\}\)\/\([0-9]\{4\}\)\t:\t\3-\1-\2\t:g'
	done < /tmp/agencies \
		| sed '1 i\agency\tdate\tamount' >/tmp/eVA_costByDateByAgency.txt
	# next take it into a db and group by date
	# NOTE(review): this loads /tmp/eVA_rva.tsv, not the /tmp/eVA_costByDateByAgency.txt written above - confirm the intended input.
	# txt2pgsql.pl's output is piped to sh for execution.
	~/git/aiddata-utils/etl/txt2pgsql.pl -i /tmp/eVA_rva.tsv -d "\t" -t "TEXT" -p del | sh
	# group costs by agency,date and export the sums as CSV with a header row
	echo "copy ( select agency,date,sum(amount::numeric) as cost from \"eVA_rva\" group by agency,date ) to stdout with csv header;" | psql del > /tmp/eVA_rva.csv

	# bash magic - passing an arbitrary number of sed commands to eval on a file to remove stopword list we built based on 1-gram frequency and lack of vowels
	# first we get a list of stopwords based on short strings, esp. w/o vowels, that are the most common 1-grams
	# then we build sed commands around these stopwords, using word boundaries (eg \bRd\b)
	# and we ask for a case-insensitive match (eg sed 's/pattern//I')
	# then we string them together with pipes
	# and cat the file of interest and eval all the sed commands!
	# this removes the stopword list we built, with case-insensitive match, and using word boundaries
	# advantage over sed OR (eg sed 's/Rd\|St//I') is that that can only do so many at a time
	# would be more efficient of course to batch these into groups of n

	# download them all
	# lftp lists everything under the base URI; each listed path is prefixed with
	# the base URI (escaped for use as a sed replacement) so wget reads full URLs.
	# wget -c resumes partial downloads, -i reads the URL list from the process substitution.
	wget -ci <(
		baseuri="ftp://ftp2.census.gov/geo/tiger/TIGER2014/ROADS/"
		escaped_baseuri=$(echo "$baseuri" | sed "s:\/:\\\/:g" | sed "s/:/\\\:/g")
		lftp -e 'find;exit' "$baseuri" | sed "s:^:${escaped_baseuri}:g"
	)
	# make single text file of unique FULLNAME roads names
	# Each zip gets its own temp dir and only its .dbf member is extracted, so the
	# *.dbf glob matches exactly that file. Blank lines and unzip's
	# "Archive:" / "inflating:" chatter are filtered out before counting.
	# NOTE(review): find_ext and sortfreq are helpers defined outside this snippet.
	find_ext zip \
		| parallel -j2 'tmp=$(mktemp -d); unzip -d "$tmp" {} "$(basename {} .zip).dbf"; dbfdump --fields FULLNAME "$tmp"/*.dbf | grep -vE "^\s*$"; rm -r "$tmp"' \
		| grep -vE "(Archive|inflating)[:]" \
		| sed '1 i\FULLNAME' \
		| sortfreq > ~/fullname_uniq.txt