Last active
July 7, 2021 07:34
-
-
Save dontcallmedom/5f620e1d2dc5ff9467b210e81a51b637 to your computer and use it in GitHub Desktop.
RFC references extractor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# We're in a directory where the plain text RFCs (*.txt) and their JSON
# metadata (*.json) have been extracted.
# Move obsolete RFCs to an "obsolete" dir so later steps do not process them.

# Succeeds when the metadata file $1 lists at least one obsoleting document.
# A missing or null "obsoleted_by" key counts as "not obsoleted".
is_obsoleted() {
  jq -e '(.obsoleted_by // []) | length > 0' "$1" > /dev/null
}

# Moves the metadata of every obsoleted RFC found in the current directory
# into ./obsolete, together with the text file of the same base name.
# NOTE(review): assumes an RFC's text lives next to its metadata as
# <name>.txt / <name>.json -- confirm against the extraction step.
move_obsolete_rfcs() {
  local meta text
  mkdir -p obsolete || return
  for meta in *.json; do
    # An unmatched glob stays literal; skip it instead of calling jq on it.
    [ -f "$meta" ] || continue
    is_obsoleted "$meta" || continue
    mv -- "$meta" obsolete/
    # The later steps read the *.txt files, so the text has to move as well.
    text="${meta%.json}.txt"
    if [ -f "$text" ]; then
      mv -- "$text" obsolete/
    fi
  done
}

move_obsolete_rfcs
# Working dir: receives the intermediate pieces produced by csplit.
# -p keeps a second run from failing when the directory already exists.
mkdir -p refs
# Directory where we will store the results:
# one file per RFC which lists the names of the normatively referenced RFCs.
mkdir -p rfc-refs
# Everything below writes into the current directory and reads ../*.txt,
# so stop right here if the working dir cannot be entered.
cd refs || exit 1
# Skip every document whose text contains the word "Informational" and only
# deal with those that contain the phrase "Normative References".
# The split pattern accepts the phrase as a heading, allowing for prefixes
# like "A.", "Appendix" or "7.1", but rejects anything with a "-" or a '"'
# in front of it (e.g. "Non-Normative References").

# Splits the RFC text $1 into pieces named <basename-without-.txt>-NN in the
# current directory; a new piece starts at every line matching the pattern.
split_at_normative_references() {
  local rfc_text=$1 prefix
  prefix="$(basename "$rfc_text" .txt)-"
  # -s: no byte counts, -z: drop empty pieces, {*}: split at every match.
  csplit -s -z -f "$prefix" -- "$rfc_text" \
    '/^\([A0-9 ][^-"]*\)*Normative References/' '{*}'
}

# Applies the split to every eligible ../*.txt file.
split_rfcs_at_normative_references() {
  local rfc_text
  for rfc_text in ../*.txt; do
    # An unmatched glob stays literal; skip it.
    [ -f "$rfc_text" ] || continue
    if grep -q -- "Informational" "$rfc_text"; then
      continue
    fi
    grep -q -- "Normative References" "$rfc_text" || continue
    split_at_normative_references "$rfc_text"
  done
}

split_rfcs_at_normative_references
# TODO: Deal with 6 documents that use the phrase "Normative references" (case)
# TODO: Deal with 3 documents that use the phrase "Normative" (case)
# TODO: Deal with 717 documents that don't use the phrase "Normative References"
# grep -L "Informational" *.txt|xargs grep -L "Experimental"|xargs grep -L "Normative References"|xargs grep -L "Informative References"|xargs grep "References"
# For each RFC, keep only the last piece of the split (pieces are named
# <rfc>-NN; the split pattern does not match "Non-Normative References").
# Remove pagination lines (they start with a non-blank character and are at
# least 72 characters wide) and split off the first section by heading
# (lines starting with a letter or a digit).

# Reads the piece $1 and writes <piece>-00, <piece>-01, ... next to it.
# <piece>-00 is expected to hold the reference section itself.
# NOTE(review): assumes the piece begins with the section heading and that at
# least one more heading follows; otherwise csplit reports "match not found"
# and, without -k, should discard its output -- confirm on real data.
split_off_first_section() {
  local piece=$1
  # \S is a GNU sed extension; {1} repeats the pattern once more, i.e. the
  # input is cut at the first two heading lines.
  sed '/^\S.\{70\}./d' -- "$piece" \
    | csplit -s -z -f "${piece}-" - '/^[0-9a-zA-Z]/' '{1}'
}

# Finds the last first-level piece of every RFC in the current directory
# and splits it with split_off_first_section.
extract_reference_sections() {
  local first root candidate last
  for first in *-00; do
    [ -f "$first" ] || continue
    # Names with two dashes are output of an earlier run of this step.
    case $first in
      *-*-*) continue ;;
    esac
    root=${first%%-*}
    last=
    # Glob results are sorted, so the last accepted name is the last piece.
    for candidate in "$root"-*; do
      case ${candidate#"$root"-} in
        *-*) continue ;;
      esac
      last=$candidate
    done
    [ -n "$last" ] || continue
    split_off_first_section "$last"
  done
}

extract_reference_sections
# In the list of references, extract anything that looks like an RFC number
# after having removed in-paragraph line breaks.

# Prints the sorted, de-duplicated RFC names ("RFC" followed by digits, any
# space in between removed) found in the reference section file $1.
# Paragraphs are separated by empty lines; from each paragraph only the last
# "RFC <number>" mention is kept.
list_rfc_refs() {
  awk '
    BEGIN { RS = "" }                 # paragraph mode: records end at empty lines
    {
      gsub(/\n */, " ")               # re-join wrapped lines of the paragraph
      last = ""
      rest = $0
      while (match(rest, /RFC ?[0-9]+/)) {
        last = substr(rest, RSTART, RLENGTH)
        rest = substr(rest, RSTART + RLENGTH)
      }
      if (last != "") {
        gsub(/ /, "", last)           # "RFC 2119" -> "RFC2119"
        print last
      }
    }
  ' "$1" | sort -u
}

# Writes one result file per RFC: ../rfc-refs/<rfc>, named after the part of
# the section file name before its first "-". The file is created even when
# no RFC reference is found.
write_rfc_ref_lists() {
  local section
  for section in *-*-00; do
    # An unmatched glob stays literal; skip it.
    [ -f "$section" ] || continue
    list_rfc_refs "$section" > "../rfc-refs/${section%%-*}"
  done
}

write_rfc_ref_lists
# cd ../rfc-refs/
# wc -l * | sort -rn
# 0 rfc5234 no rfc ref
# 0 rfc4506 no rfc ref
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment