TomConlin · March 14, 2018 06:49
diff --git a/second_v2small_file_survey b/second_v2small_file_survey
 head -1 rare-diseases/annotated/v2smallFiles/DECIPHER-18.tab | tr '\t' '\n' | grep -n .
 1:#diseaseID
 2:diseaseName
 3:phenotypeId
 4:phenotypeName
 5:onsetId
 6:onsetName
 7:frequency
 8:sex
 9:negation
 10:modifier
 11:description
 12:publication
 13:evidence
 14:assignedBy
 15:dateCreated

 ########################

 # easier to survey the records in one pile
 grep -h -v "^#" rare-diseases/annotated/v2smallFiles/*.tab > v2.tab

 # howmany rows
 wc -l < v2.tab
 92,727

 # right number of columns every time?
 awk -F'\t' 'NF!=15' v2.tab
 # yep

 # disease identifier types 
 cut -f1  -d':' v2.tab | sort | uniq -c | sort -nr
  92430 OMIM
    297 DECIPHER

 # files with the most rows
 cut -f1  v2.tab |  uniq -c | sort -nr| head 
    131 OMIM:312870
    128 OMIM:180849
    108 OMIM:607872
    108 OMIM:194050
     99 OMIM:613406
     95 OMIM:270400
     94 OMIM:194190
     87 OMIM:601803
     87 OMIM:122470
     85 OMIM:305600

 #  check if any disease identifiers in more than one file?
 cut -f1  v2.tab |  uniq -c | sort -nr| cut -c9- | uniq -c | sort -nr | head  
      1 OMIM:617537
      1 OMIM:617526
      1 OMIM:617506
      1 OMIM:617478
      1 OMIM:617468
      1 OMIM:617466
      1 OMIM:617460
      1 OMIM:617452
      1 OMIM:617450
      1 OMIM:617442
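 # no -- but note the second uniq -c only merges repeats that are adjacent after sort -nr,
 # i.e. those with equal row counts; sorting the identifiers before counting would be a firmer check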

 # summary stats on rows per file
 cut -f1  v2.tab |  uniq -c | sort -nr| cut -c1-8 | ./sumstat.r
       V1        
 Min.   :  1.00  
 1st Qu.:  4.00  
 Median :  8.00  
 Mean   : 12.61  
 3rd Qu.: 17.00  
 Max.   :131.00  
 [1] "sd :12.46"

 ###########################################
 # sumstat.r 
 #! /usr/bin/Rscript --vanilla
 	x <- read.csv('stdin', header = F); 
 	summary(x);
 	sprintf("sd :%.02f", sd(x[,1]));
 ###########################################





 ###########################################
 # diseaseID
 # howmany distinct disease identifiers
 cut -f1 v2.tab | uniq | wc -l
 7351

 #####################################
 # diseaseName
 # howmany distinct disease ... "word thingies"(tm)
 cut -f2 v2.tab | uniq | wc -l
 12378
 # not 1:1

 # sometimes appears to be a ';;'-separated list
 # howmany are lists? 
 cut -f2 v2.tab | sort -u | grep -c  ";;" 
 2437

 ##########################################
 # phenotypeId
 # howmany?
 cut -f3 v2.tab | wc -l
 92727
 # unique?
 cut -f3 v2.tab | sort -u | wc -l
 6994

 # have correct curie prefix?
 cut -f3 v2.tab | cut -f1 -d':' | uniq -c
  92727 HP

 ################################
 # phenotypeName
 cut -f4 v2.tab  | wc -l
 92727
 cut -f4 v2.tab  | sort -u | wc -l
 6994

 # anything not simple words? any punctuation indicating lists?
 cut -f4 v2.tab  | grep -v "[a-z A-Z]*"
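 # nothing -- but this check is vacuous: "[a-z A-Z]*" also matches the empty string,
 # so every line matches and -v can never print anything.
 # An anchored pattern such as '^[a-zA-Z ]*$' would actually test this.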
 ################################
 # onsetId
 cut -f5 v2.tab | sort | uniq -c | sort -nr
  92194 
    134 HP:0003577
     97 HP:0003593
     87 HP:0011463
     74 HP:0003623
     50 HP:0003581
     45 HP:0003621
     33 HP:0003584
      8 HP:0011462
      3 HP:0003596
      1 HP:0011461
      1 HP:0003674

 #####################################
 # onsetName
 # (command not recorded; presumably: cut -f6 v2.tab | sort | uniq -c | sort -nr)
  92194 
    134 Congenital onset
     97 Infantile onset
     87 Childhood onset
     74 Neonatal onset
     50 Adult onset
     45 Juvenile onset
     33 Late onset
      8 Young adult onset
      3 Middle age onset
      1 Onset
      1 Fetal onset

 ##########################################

 The columns here have been reordered since last time

 repo was not in sync on removed files 

 ################################
 # frequency  
 # sparse hodgepodge of:
 #   nothing 
 #	identifiers
 #	rationals
 #	percentages (including ranges of percentages)

 # easy: express percentages more uniformly
 # hard: recover the proper rational a percentage was derived from (curating pubs)
 #
 # mixing identifiers and rationals; 
 # I guess one sparse column is better than two
 # but what would really make it worthwhile is if the identifiers
 # referenced a value (back in the ontology) which allowed them to be comparable
 # (even approximately) to the proper rationals (and percentages)
 #
 # Okay we sort of have that but uncomputably in a string

 HP:0040283   "Occasional (29-5%)"

 Maybe something like statements along the lines of: 

 <HP:0040283>  <???:greater than or equal to>  0.05 . 
 <HP:0040283>  <???:less than or equal to>  0.29 .

 ----------------------------------------------------

 cut -f7 v2.tab | sort | uniq -c | sort -nr | head
 84602 
   5087 HP:0040283
    682 HP:0040282
    179 HP:0040281
    116 2/2
     87 3/3
     74 HP:0040280
     70 7.5000%
     59 1/3



 # are all rationals proper?
 cut -f7 v2.tab | awk -F'/' '$2>0{if($1>$2)print}'
 # of course they are. 

 ##################################
 # sex
 cut -f8 v2.tab | sort | uniq -c | sort -nr
  92647 
     58 Male
     22 Female


 ################################
 # negation
 cut -f9 v2.tab | sort | uniq -c | sort -nr
  91939 
    788 NO

 #################################
 # modifier
 cut -f10 v2.tab | sort | uniq -c | sort -nr
  91950 
    307 HP:0012825
    194 HP:0012828
     53 HP:0003676
     42 HP:0025303
     35 HP:0012829
     27 HP:0012832
     26 HP:0012833
     23 HP:0031796
     16 HP:0031375
     15 HP:0012826
     12 HP:0012840
     11 HP:0012839
      8 HP:0012837
      5 HP:0011010
      1 HP:0030650
      1 HP:0012827
      1 HP:0003831

 ########################################
 # description
 # cut -f11 v2.tab | sort | uniq -c | sort -nr | less
 # 60357  empty
 # OMIM screaming caps (mostly descriptive) 
 # and some other more random hint-like statements
 # including data that visually seems like it belongs in other columns

 	frequency
 		5% to 13%
 		2/7

 	sex
 		in males

 	negation
 		NOT

 	onset
 		In infancy
 		School age onset
 		Onset usually before puberty
 		Onset in early childhoos               <- yep, childhoos
      	Onset by age of three years
      	Onset at birth or in childhood
      	Onset about puberty


 Not sure where I would put a description consisting of "0"

 ########################################
 # publication
 # hmmm... plural but more like
 # citations  

 # can be lists (with a different separator than the previous list)
 # can be missing curie suffix  `OMIM:`  (seventy like this) 
 # can be url
 # can be ISBN
 # can mix curie case 	`PMID:17918734;pmid:12687501`
 # can be spaced out   	`PMID:    17223397`
 # can be bare integer  	`12089525` 
 # can be folks   		`HPO:sdoelken`
 # there can be space after list separators (or not)

 # howmany are lists:
 cut -f12 v2.tab | grep -c ';'
 372


 ###############################################################
 # Note: 
 # web search on 'GO_evidence_code' returns correct code descriptions as top hit 
 # evidence
 cut -f13 v2.tab | sort | uniq -c | sort -nr 
  43897 TAS
  42405 IEA
   6400 PCS
     25 ICE


 #####################################
 # assignedBy
 cut -f14 v2.tab | sort | uniq -c | sort -nr 
  42088 HPO:skoehler
  36962 HPO:iea
  13523 HPO:probinson
     51 HPO:lccarmody
     49 HPO:sdoelken
     34 ZFIN:bruef; HPO:sdoelken
     13 HPO:curators
      6 PATOC:GVG; PATOC:PS
      1 HPO:nvasilevsky

 # Is there a good reason not to insist on ORCIDs?  
 # surely these people must be amongst the most capable of understanding why.


 ##################################################
 # dateCreated
 cut -f15 v2.tab | sort | uniq -c | sort -nr | head

  40213 2009-02-17     A very busy day
   7718 2017-07-13
   6532 2012-10-17
   2434 2015-12-30
   2185 2010-06-20
   1958 2010-06-19
   1145 2012-11-18
   1089 2010-06-18
   1014 2012-04-24
    942 2014-11-26

 the great thing about this format is how easy it is to spot outliers

 cut -f15 v2.tab | sort -u | head
 2009-02-17
 2009-07-24
 2009-07-31
 2009-08-31
 2009-09-17
 2009-10-01
 2009-10-02
 2009-10-09
 2009-10-15
 2009-10-16

 
 cut -f15 v2.tab | sort -u | tail
 2017-12-11
 2017-12-12
 2017-12-13
 2017-12-17
 2017-12-22
 2018-01-25
 2018-01-28
 2018-03-04
 2018-03-05
 2018-03-07
  

 # check the rest

 for date in $(cut -f15 v2.tab | sort -u); do 
 	date --date=${date}; 
 done | grep invalid
 date: invalid date ‘2018-15-20’

 # the dates all look valid.
	head -1 rare-diseases/annotated/v2smallFiles/DECIPHER-18.tab \| tr '\t' '\n' \| grep -n .
	1:#diseaseID
	2:diseaseName
	3:phenotypeId
	4:phenotypeName
	5:onsetId
	6:onsetName
	7:frequency
	8:sex
	9:negation
	10:modifier
	11:description
	12:publication
	13:evidence
	14:assignedBy
	15:dateCreated

	########################

	# easier to survey the records in one pile
	grep -h -v "^#" rare-diseases/annotated/v2smallFiles/*.tab > v2.tab

	# howmany rows
	wc -l < v2.tab
	92,727

	# right number of columns every time?
	awk -F'\t' 'NF!=15' v2.tab
	# yep

	# disease identifier types
	cut -f1 -d':' v2.tab \| sort \| uniq -c \| sort -nr
	92430 OMIM
	297 DECIPHER

	# files with the most rows
	cut -f1 v2.tab \| uniq -c \| sort -nr\| head
	131 OMIM:312870
	128 OMIM:180849
	108 OMIM:607872
	108 OMIM:194050
	99 OMIM:613406
	95 OMIM:270400
	94 OMIM:194190
	87 OMIM:601803
	87 OMIM:122470
	85 OMIM:305600

	# check if any disease identifiers in more than one file?
	cut -f1 v2.tab \| uniq -c \| sort -nr\| cut -c9- \| uniq -c \| sort -nr \| head
	1 OMIM:617537
	1 OMIM:617526
	1 OMIM:617506
	1 OMIM:617478
	1 OMIM:617468
	1 OMIM:617466
	1 OMIM:617460
	1 OMIM:617452
	1 OMIM:617450
	1 OMIM:617442
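	# no -- but note the second uniq -c only merges repeats that are adjacent after sort -nr,
	# i.e. those with equal row counts; sorting the identifiers before counting would be a firmer check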

	# summary stats on rows per file
	cut -f1 v2.tab \| uniq -c \| sort -nr\| cut -c1-8 \| ./sumstat.r
	V1
	Min. : 1.00
	1st Qu.: 4.00
	Median : 8.00
	Mean : 12.61
	3rd Qu.: 17.00
	Max. :131.00
	[1] "sd :12.46"

	###########################################
	# sumstat.r
	#! /usr/bin/Rscript --vanilla
	x <- read.csv('stdin', header = F);
	summary(x);
	sprintf("sd :%.02f", sd(x[,1]));
	###########################################





	###########################################
	# diseaseID
	# howmany distinct disease identifiers
	cut -f1 v2.tab \| uniq \| wc -l
	7351

	#####################################
	# diseaseName
	# howmany distinct disease ... "word thingies"(tm)
	cut -f2 v2.tab \| uniq \| wc -l
	12378
	# not 1:1

	# sometimes appears to be a ';;'-separated list
	# howmany are lists?
	cut -f2 v2.tab \| sort -u \| grep -c ";;"
	2437

	##########################################
	# phenotypeId
	# howmany?
	cut -f3 v2.tab \| wc -l
	92727
	# unique?
	cut -f3 v2.tab \| sort -u \| wc -l
	6994

	# have correct curie prefix?
	cut -f3 v2.tab \| cut -f1 -d':' \| uniq -c
	92727 HP

	################################
	# phenotypeName
	cut -f4 v2.tab \| wc -l
	92727
	cut -f4 v2.tab \| sort -u \| wc -l
	6994

	# anything not simple words? any punctuation indicating lists?
	cut -f4 v2.tab \| grep -v "[a-z A-Z]*"
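	# nothing -- but this check is vacuous: "[a-z A-Z]*" also matches the empty string,
	# so every line matches and -v can never print anything.
	# An anchored pattern such as '^[a-zA-Z ]*$' would actually test this.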
	################################
	# onsetId
	cut -f5 v2.tab \| sort \| uniq -c \| sort -nr
	92194
	134 HP:0003577
	97 HP:0003593
	87 HP:0011463
	74 HP:0003623
	50 HP:0003581
	45 HP:0003621
	33 HP:0003584
	8 HP:0011462
	3 HP:0003596
	1 HP:0011461
	1 HP:0003674

	#####################################
	# onsetName
	# (command not recorded; presumably: cut -f6 v2.tab \| sort \| uniq -c \| sort -nr)
	92194
	134 Congenital onset
	97 Infantile onset
	87 Childhood onset
	74 Neonatal onset
	50 Adult onset
	45 Juvenile onset
	33 Late onset
	8 Young adult onset
	3 Middle age onset
	1 Onset
	1 Fetal onset

	##########################################

	The columns here have been reordered since last time

	repo was not in sync on removed files

	################################
	# frequency
	# sparse hodgepodge of:
	# nothing
	# identifiers
	# rationals
	# percentages (including ranges of percentages)

	# easy: express percentages more uniformly
	# hard: recover the proper rational a percentage was derived from (curating pubs)
	#
	# mixing identifiers and rationals;
	# I guess one sparse column is better than two
	# but what would really make it worthwhile is if the identifiers
	# referenced a value (back in the ontology) which allowed them to be comparable
	# (even approximately) to the proper rationals (and percentages)
	#
	# Okay we sort of have that but uncomputably in a string

	HP:0040283 "Occasional (29-5%)"

	Maybe something like statements along the lines of:

	<HP:0040283> <???:greater than or equal to> 0.05 .
	<HP:0040283> <???:less than or equal to> 0.29 .

	----------------------------------------------------

	cut -f7 v2.tab \| sort \| uniq -c \| sort -nr \| head
	84602
	5087 HP:0040283
	682 HP:0040282
	179 HP:0040281
	116 2/2
	87 3/3
	74 HP:0040280
	70 7.5000%
	59 1/3



	# are all rationals proper?
	cut -f7 v2.tab \| awk -F'/' '$2>0{if($1>$2)print}'
	# of course they are.

	##################################
	# sex
	cut -f8 v2.tab \| sort \| uniq -c \| sort -nr
	92647
	58 Male
	22 Female


	################################
	# negation
	cut -f9 v2.tab \| sort \| uniq -c \| sort -nr
	91939
	788 NO

	#################################
	# modifier
	cut -f10 v2.tab \| sort \| uniq -c \| sort -nr
	91950
	307 HP:0012825
	194 HP:0012828
	53 HP:0003676
	42 HP:0025303
	35 HP:0012829
	27 HP:0012832
	26 HP:0012833
	23 HP:0031796
	16 HP:0031375
	15 HP:0012826
	12 HP:0012840
	11 HP:0012839
	8 HP:0012837
	5 HP:0011010
	1 HP:0030650
	1 HP:0012827
	1 HP:0003831

	########################################
	# description
	# cut -f11 v2.tab \| sort \| uniq -c \| sort -nr \| less
	# 60357 empty
	# OMIM screaming caps (mostly descriptive)
	# and some other more random hint-like statements
	# including data that visually seems like it belongs in other columns

	frequency
	5% to 13%
	2/7

	sex
	in males

	negation
	NOT

	onset
	In infancy
	School age onset
	Onset usually before puberty
	Onset in early childhoos <- yep, childhoos
	Onset by age of three years
	Onset at birth or in childhood
	Onset about puberty


	Not sure where I would put a description consisting of "0"

	########################################
	# publication
	# hmmm... plural but more like
	# citations

	# can be lists (with a different separator than the previous list)
	# can be missing curie suffix `OMIM:` (seventy like this)
	# can be url
	# can be ISBN
	# can mix curie case `PMID:17918734;pmid:12687501`
	# can be spaced out `PMID: 17223397`
	# can be bare integer `12089525`
	# can be folks `HPO:sdoelken`
	# there can be space after list separators (or not)

	# howmany are lists:
	cut -f12 v2.tab \| grep -c ';'
	372


	###############################################################
	# Note:
	# web search on 'GO_evidence_code' returns correct code descriptions as top hit
	# evidence
	cut -f13 v2.tab \| sort \| uniq -c \| sort -nr
	43897 TAS
	42405 IEA
	6400 PCS
	25 ICE


	#####################################
	# assignedBy
	cut -f14 v2.tab \| sort \| uniq -c \| sort -nr
	42088 HPO:skoehler
	36962 HPO:iea
	13523 HPO:probinson
	51 HPO:lccarmody
	49 HPO:sdoelken
	34 ZFIN:bruef; HPO:sdoelken
	13 HPO:curators
	6 PATOC:GVG; PATOC:PS
	1 HPO:nvasilevsky

	# Is there a good reason not to insist on ORCIDs?
	# surely these people must be amongst the most capable of understanding why.


	##################################################
	# dateCreated
	cut -f15 v2.tab \| sort \| uniq -c \| sort -nr \| head

	40213 2009-02-17 A very busy day
	7718 2017-07-13
	6532 2012-10-17
	2434 2015-12-30
	2185 2010-06-20
	1958 2010-06-19
	1145 2012-11-18
	1089 2010-06-18
	1014 2012-04-24
	942 2014-11-26

	the great thing about this format is how easy it is to spot outliers

	cut -f15 v2.tab \| sort -u \| head
	2009-02-17
	2009-07-24
	2009-07-31
	2009-08-31
	2009-09-17
	2009-10-01
	2009-10-02
	2009-10-09
	2009-10-15
	2009-10-16


	cut -f15 v2.tab \| sort -u \| tail
	2017-12-11
	2017-12-12
	2017-12-13
	2017-12-17
	2017-12-22
	2018-01-25
	2018-01-28
	2018-03-04
	2018-03-05
	2018-03-07


	# check the rest

	for date in $(cut -f15 v2.tab \| sort -u); do
	date --date=${date};
	done \| grep invalid
	date: invalid date ‘2018-15-20’

	# the dates all look valid.