Last active
August 29, 2015 14:24
-
-
Save albert-decatur/2c6d28f554913ebc1190 to your computer and use it in GitHub Desktop.
from eVA, get dates and costs for procurements
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# for every agency, get name,isodates,cost | |
# cost is just units*unit price | |
# ignore cost if it's <1 USD | |
in=eVA_healthyRecords_2015-07-12.tsv | |
while read agency; do cat $in | tawk "{if(\$2 ~ /\"${agency}\"/ && \$5*\$6 > 1)print \$2,\$3,\$6*\$5}" | sed 's:"::g' | sed 's:\t\([0-9]\{2\}\)\/\([0-9]\{2\}\)\/\([0-9]\{4\}\)\t:\t\3-\1-\2\t:g'; done < /tmp/agencies | sed '1 i\agency\tdate\tamount' >/tmp/eVA_costByDateByAgency.txt | |
# next take it into a db and group by date | |
~/git/aiddata-utils/etl/txt2pgsql.pl -i /tmp/eVA_rva.tsv -d "\t" -t "TEXT" -p del | sh | |
# group costs by agency,date | |
echo "copy ( select agency,date,sum(amount::numeric) as cost from \"eVA_rva\" group by agency,date ) to stdout with csv header;" | psql del > /tmp/eVA_rva.csv |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# cumulative cost of procurement by agency over time | |
# where fields are 1) agency name, 2) date order asc, 3) cost | |
library(ggplot2) | |
# to transform data | |
library(plyr) | |
# needed for commas in y axis breaks | |
library(scales) | |
# to label log scale axis with sensible ticks | |
df <- read.csv("extras/vcu_v_publicSchools.csv") | |
df.t <- ddply(df,.(agency),transform,ccost=cumsum(cost)) | |
# make breaks for plot - these will follow log scale but in plain notation | |
breaks = 10**(1:10) | |
# plot cumulative cost over time by agency | |
# use log axis for y | |
# but breaks will not be in sci notation | |
# NB: casting date fields as.Date works wonders for axis labels, plot background, etc! | |
ggplot(df.t, aes(x=as.Date(date), y=ccost, colour=agency, group=agency)) + geom_line() + coord_trans(y = "log10") + labs(x="date",y="cumulative spending, USD, (log scale)",title="title") + scale_y_log10(breaks = breaks, labels = comma(breaks)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
in=eVA_yearsWithDataByAgency.tsv; cat eVA_totalCostByAgency.tsv |head -n 26 | c 1 | sed '1d' | tac | parallel 'yrs_avail=$(mawk -F"\t" "{if(\$1 ~ /^$(echo {})$/)print \$0}" '$in' | cut -f 2); yrs_not_avail=$(hash_diff.pl -a <(echo "$yrs_avail") -b <(seq 2001 2015) | grep lista | grep -oE "^[^:]*" ); echo "$yrs_not_avail" | sed "s:^:$(echo {})\t:g"' | sed '1 i\agency\tyear_unavilable' > /tmp/foo.tsv |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# read in data with these conditions: | |
# vars are agency names, year with missing data, total agency cost (just for scaling point geoms) | |
# sorted by agency total cost asc | |
# sorted asc b/c coord_flip() flips that value | |
df <- read.csv("/tmp/foo.csv",sep="\t") | |
library(ggplot2) | |
# make sure to put this in the order the data appears,which is by total cost FY2001-2015 | |
df$agency <- factor(df$agency, levels=unique(df$agency)) | |
# plot with no legend, right justified title, use black outlines for minor panel grid, use tiles as geom and color by year | |
ggplot(df,aes(x=df$agency,y=df$year)) + coord_flip() + labs(x="eVA agency",y="year",title="Years with No Data by Agency\nState of Virginia Procurement, eVA FY2001-2015") + theme(plot.title = element_text(hjust=1)) + theme(legend.position="none") + geom_tile(fill=df$year) + theme(panel.grid.minor = element_line(colour = "black")) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# read.csv a table with agency name, count of procurements, month (needs to be YYYY-MM-DD for this example but you can just set them all to the first) | |
png("/tmp/foo.png",unit="in",res=600,height=6,width=12);ggplot(df,aes(x=df$agency,y=df$month)) + coord_flip() + labs(x="eVA agency",y="year",title="Number of Procurements By Month,\neVA FY2001-2015\n(Top 25 Agencies by Spending)") + theme(legend.position="none") + scale_y_date(breaks = date_breaks("years"),labels = date_format("%Y")) + theme(axis.text.x = element_text(angle=290,vjust=.4)) + theme(panel.background = element_blank()) + geom_tile(aes(fill=df$count_po));dev.off() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# given a table of agency, date, cost | |
# produce a table of agency, month,procurement count, USD spent on procurement | |
# NB: none of this is inflation adjusted | |
in=eVA_costByDateByAgency.tsv; cat eVA_totalCostByAgency.tsv |head -n 26 | c 1 | sed '1d' | tac | parallel 'agency_records=$( mawk -F"\t" "{if(\$1 ~ /^$(echo {})$/)print \$0}" '$in'); byMonth=$(echo "$agency_records" | cut -f2 | sed "s:...$::g" ); countByMonth=$( echo "$byMonth" | sort | uniq -c | sed "s:^\s\+::g;s:\s:\t:g" ); echo "$byMonth" | sort | uniq | while read month; do count=$( echo "$countByMonth" | grep -E "\b${month}\b" | cut -f1 ); monthly_cost=$( echo "$agency_records" | mawk -F"\t" "{if(\$2 ~ /^${month}/)print \$3}" | awk "{ sum += \$1 } END { printf \"%.4f\n\", sum }" ); echo -e {}"\t$(echo $month | sed "s:$:-01:g")\t$count\t$monthly_cost"; done' | sed '1 i\agency\tmonth\tpo_count\tpo_spend' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
in=eVA_costByMonthByAgency.tsv; cat $in | sed '1d' | cut -f1 | sort | uniq | parallel 'records=$(mawk -F"\t" "{if(\$1 ~ /^$(echo {})$/)print \$0}" '$in' ); echo "$records"|mawk -F"\t" "{gsub(/-[0-9]+-[0-9]+$/,\"\",\$2);print \$2}" | sort | uniq | while read year; do agency_yr=$( echo "$records" | mawk -F"\t" "{if(\$2 ~ /^${year}-/)print \$0}" ); count_po_yr=$( echo "$agency_yr" | cut -f 3 | awk "{ sum += \$1 } END { printf \"%.0f\n\", sum }" ); spend_yr=$( echo "$agency_yr" | cut -f 4 | awk "{ sum += \$1 } END { printf \"%.0f\n\", sum }" ); echo -e {}"\t$year\t$count_po_yr\t$spend_yr"; done'|sed '1 i\agency\tyear\tpo_count_yr\tpo_spend_yr' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# load spending by month with fields: | |
# agency name, month, spend for month, year, count of po | |
df <- read.csv("extras/eVA_costByMonthByAgency.tsv",sep="\t") | |
# load libraries | |
library(plyr) # for transform | |
library(ggplot2) | |
# date field as type date | |
df$date <- as.Date(df$date) | |
# get po_spend rescaled by agency/year | |
df.t <- ddply(df,.(agency,year),transform,rescaled_po_spend=rescale(po_spend)) | |
png("/tmp/foo.png",unit="in",res=300,height=6,width=12);ggplot(df.t,aes(x=df.t$agency,y=df.t$month)) + coord_flip() + labs(x="agency",y="year") + theme(legend.position="none") + scale_y_date(breaks = date_breaks("years"),labels = date_format("%Y")) + theme(axis.text.x = element_text(angle=290,vjust=.4)) + scale_fill_gradient(low="white",high="steelblue") + theme(panel.grid = element_blank()) + geom_tile(aes(fill=df.t$rescaled_po_spend),color="grey",linestyle="dotted") + ggtitle(expression(atop("High Procurement Spending Months by Agency/Year\nState of Virginia, eVA FY2001-2015", atop(italic("top 25 agencies by total procurement spending. data: bit.ly/eVA-byMonth"), "")))) + theme(plot.title = element_text(hjust=0,vjust=0));dev.off() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment