Created
March 14, 2018 06:49
-
-
Save TomConlin/31ed4f2b87bedb946de7b908d08862b2 to your computer and use it in GitHub Desktop.
Second round of v2small files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
head -1 rare-diseases/annotated/v2smallFiles/DECIPHER-18.tab | tr '\t' '\n' | grep -n . | |
1:#diseaseID | |
2:diseaseName | |
3:phenotypeId | |
4:phenotypeName | |
5:onsetId | |
6:onsetName | |
7:frequency | |
8:sex | |
9:negation | |
10:modifier | |
11:description | |
12:publication | |
13:evidence | |
14:assignedBy | |
15:dateCreated | |
######################## | |
# easier to survey the records in one pile | |
grep -h -v "^#" rare-diseases/annotated/v2smallFiles/*.tab > v2.tab | |
# howmany rows | |
wc -l < v2.tab | |
92,727 | |
# right number of columns every time? | |
awk -F'\t' 'NF!=15' v2.tab | |
# yep | |
# disease identifier types | |
cut -f1 -d':' v2.tab | sort | uniq -c | sort -nr | |
92430 OMIM | |
297 DECIPHER | |
# files with the most rows | |
cut -f1 v2.tab | uniq -c | sort -nr| head | |
131 OMIM:312870 | |
128 OMIM:180849 | |
108 OMIM:607872 | |
108 OMIM:194050 | |
99 OMIM:613406 | |
95 OMIM:270400 | |
94 OMIM:194190 | |
87 OMIM:601803 | |
87 OMIM:122470 | |
85 OMIM:305600 | |
# check whether any disease identifier appears in more than one file | |
cut -f1 v2.tab | uniq -c | sort -nr| cut -c9- | uniq -c | sort -nr | head | |
1 OMIM:617537 | |
1 OMIM:617526 | |
1 OMIM:617506 | |
1 OMIM:617478 | |
1 OMIM:617468 | |
1 OMIM:617466 | |
1 OMIM:617460 | |
1 OMIM:617452 | |
1 OMIM:617450 | |
1 OMIM:617442 | |
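# no -- caveat: the second `uniq -c` only merges adjacent lines, so an ID split across runs with different counts would be missed; `cut -f1 v2.tab | uniq | sort | uniq -d` is a stricter check | |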
# summary stats on rows per file | |
cut -f1 v2.tab | uniq -c | sort -nr| cut -c1-8 | ./sumstat.r | |
V1 | |
Min. : 1.00 | |
1st Qu.: 4.00 | |
Median : 8.00 | |
Mean : 12.61 | |
3rd Qu.: 17.00 | |
Max. :131.00 | |
[1] "sd :12.46" | |
########################################### | |
# sumstat.r | |
#! /usr/bin/Rscript --vanilla | |
x <- read.csv('stdin', header = F); | |
summary(x); | |
sprintf("sd :%.02f", sd(x[,1])); | |
########################################### | |
########################################### | |
# diseaseID | |
# howmany distinct disease identifiers | |
cut -f1 v2.tab | uniq | wc -l | |
7351 | |
##################################### | |
# diseaseName | |
# howmany distinct disease ... "word thingies"(tm) | |
cut -f2 v2.tab | uniq | wc -l | |
12378 | |
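# not 1:1 with the 7351 disease identifiers (note: the unsorted `uniq` counts runs of adjacent names, not strictly distinct names) | |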
# sometimes appears to be a ';;' separated list | |
# howmany are lists? | |
cut -f2 v2.tab | sort -u | grep -c ";;" | |
2437 | |
########################################## | |
# phenotypeId | |
# howmany? | |
cut -f3 v2.tab | wc -l | |
92727 | |
# unique? | |
cut -f3 v2.tab | sort -u | wc -l | |
6994 | |
# have correct curie prefix? | |
cut -f3 v2.tab | cut -f1 -d':' | uniq -c | |
92727 HP | |
################################ | |
# phenotypeName | |
cut -f4 v2.tab | wc -l | |
92727 | |
cut -f4 v2.tab | sort -u | wc -l | |
6994 | |
# anything not simple words? any punctuation indicating lists? | |
cut -f4 v2.tab | grep -v "[a-z A-Z]*" | |
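# nothing -- but this result is vacuous: `[a-z A-Z]*` matches the empty string on every line, so `grep -v` can never print anything; `grep '[^a-zA-Z ]'` would actually list names containing other characters | |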
################################ | |
# onsetId | |
cut -f5 v2.tab | sort | uniq -c | sort -nr | |
92194 | |
134 HP:0003577 | |
97 HP:0003593 | |
87 HP:0011463 | |
74 HP:0003623 | |
50 HP:0003581 | |
45 HP:0003621 | |
33 HP:0003584 | |
8 HP:0011462 | |
3 HP:0003596 | |
1 HP:0011461 | |
1 HP:0003674 | |
##################################### | |
# onsetName | |
92194 | |
134 Congenital onset | |
97 Infantile onset | |
87 Childhood onset | |
74 Neonatal onset | |
50 Adult onset | |
45 Juvenile onset | |
33 Late onset | |
8 Young adult onset | |
3 Middle age onset | |
1 Onset | |
1 Fetal onset | |
########################################## | |
Here columns reordered from last time | |
repo was not in sync on removed files | |
################################ | |
# frequency | |
# sparse hodgepodge of: | |
# nothing | |
# identifiers | |
# rationals | |
# percentages (including ranges of percentages) | |
# easy: express percentages more uniformly | |
# hard: recover the proper rational a percentage was derived from (curating pubs) | |
# | |
# mixing identifiers and rationals; | |
# I guess one sparse column is better than two | |
# but what would really make it worthwhile is if the identifiers | |
# referenced a value (back in the ontology) which allowed them to be comparable | |
# (even approximately) to the proper rationals (and percentages) | |
# | |
# Okay, we sort of have that, but uncomputably, in a string | |
HP:0040283 "Occasional (29-5%)" | |
Maybe something like statements along the lines of: | |
<HP:0040283> <???:greater than or equal to> 0.05 . | |
<HP:0040283> <???:less than or equal to> 0.29 . | |
---------------------------------------------------- | |
cut -f7 v2.tab | sort | uniq -c | sort -nr | head | |
84602 | |
5087 HP:0040283 | |
682 HP:0040282 | |
179 HP:0040281 | |
116 2/2 | |
87 3/3 | |
74 HP:0040280 | |
70 7.5000% | |
59 1/3 | |
# are all rationals proper? | |
cut -f7 v2.tab | awk -F'/' '$2>0{if($1>$2)print}' | |
# of course they are. | |
################################## | |
# sex | |
cut -f8 v2.tab | sort | uniq -c | sort -nr | |
92647 | |
58 Male | |
22 Female | |
################################ | |
# negation | |
cut -f9 v2.tab | sort | uniq -c | sort -nr | |
91939 | |
788 NO | |
################################# | |
# modifier | |
cut -f10 v2.tab | sort | uniq -c | sort -nr | |
91950 | |
307 HP:0012825 | |
194 HP:0012828 | |
53 HP:0003676 | |
42 HP:0025303 | |
35 HP:0012829 | |
27 HP:0012832 | |
26 HP:0012833 | |
23 HP:0031796 | |
16 HP:0031375 | |
15 HP:0012826 | |
12 HP:0012840 | |
11 HP:0012839 | |
8 HP:0012837 | |
5 HP:0011010 | |
1 HP:0030650 | |
1 HP:0012827 | |
1 HP:0003831 | |
######################################## | |
# description | |
# cut -f11 v2.tab | sort | uniq -c | sort -nr | less | |
# 60357 empty | |
# OMIM screaming caps (mostly descriptive) | |
# and some other more random hint-like statements | |
# including data that visually seems like it belongs in other columns | |
frequency | |
5% to 13% | |
2/7 | |
sex | |
in males | |
negation | |
NOT | |
onset | |
In infancy | |
School age onset | |
Onset usually before puberty | |
Onset in early childhoos <- yep, childhoos | |
Onset by age of three years | |
Onset at birth or in childhood | |
Onset about puberty | |
Not sure where I would put a description consisting of "0" | |
######################################## | |
# publication | |
# hmmm... plural but more like | |
# citations | |
# can be lists (with a different separator than the previous list) | |
# can be missing curie suffix `OMIM:` (seventy like this) | |
# can be url | |
# can be ISBN | |
# can mix curie case `PMID:17918734;pmid:12687501` | |
# can be spaced out `PMID: 17223397` | |
# can be bare integer `12089525` | |
# can be folks `HPO:sdoelken` | |
# there can be space after list separators (or not) | |
# howmany are lists: | |
cut -f12 v2.tab | grep -c ';' | |
372 | |
############################################################### | |
# Note: | |
# web search on 'GO_evidence_code' returns correct code descriptions as top hit | |
# evidence | |
cut -f13 v2.tab | sort | uniq -c | sort -nr | |
43897 TAS | |
42405 IEA | |
6400 PCS | |
25 ICE | |
##################################### | |
# assignedBy | |
cut -f14 v2.tab | sort | uniq -c | sort -nr | |
42088 HPO:skoehler | |
36962 HPO:iea | |
13523 HPO:probinson | |
51 HPO:lccarmody | |
49 HPO:sdoelken | |
34 ZFIN:bruef; HPO:sdoelken | |
13 HPO:curators | |
6 PATOC:GVG; PATOC:PS | |
1 HPO:nvasilevsky | |
# Is there a good reason not to insist on ORCIDs? | |
# surely these people must be amongst the most capable of understanding why. | |
################################################## | |
# date_created | |
cut -f15 v2.tab | sort | uniq -c | sort -nr | head | |
40213 2009-02-17 A very busy day | |
7718 2017-07-13 | |
6532 2012-10-17 | |
2434 2015-12-30 | |
2185 2010-06-20 | |
1958 2010-06-19 | |
1145 2012-11-18 | |
1089 2010-06-18 | |
1014 2012-04-24 | |
942 2014-11-26 | |
the great thing about this format is how easy it is to spot outliers | |
cut -f15 v2.tab | sort -u | head | |
2009-02-17 | |
2009-07-24 | |
2009-07-31 | |
2009-08-31 | |
2009-09-17 | |
2009-10-01 | |
2009-10-02 | |
2009-10-09 | |
2009-10-15 | |
2009-10-16 | |
cut -f15 v2.tab | sort -u | tail | |
2017-12-11 | |
2017-12-12 | |
2017-12-13 | |
2017-12-17 | |
2017-12-22 | |
2018-01-25 | |
2018-01-28 | |
2018-03-04 | |
2018-03-05 | |
2018-03-07 | |
# check the rest | |
for date in $(cut -f15 v2.tab | sort -u); do | |
date --date=${date}; | |
done | grep invalid | |
date: invalid date ‘2018-15-20’ | |
# apart from that one (2018-15-20, month 15), the dates all look valid. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment