Skip to content

Instantly share code, notes, and snippets.

@tjmgis
Created July 31, 2013 15:36
Show Gist options
  • Save tjmgis/6123095 to your computer and use it in GitHub Desktop.
Save tjmgis/6123095 to your computer and use it in GitHub Desktop.
Python script for splitting OS AddressBase Premium CSV files by Record Indentifier
#-------------------------------------------------------------------------------
# Name: `OS AddressBase Premium Splitter v0.1
# Purpose: `This Python script is used to read OS AddressBase Premium CSV
# files and split them by record identifier type.
#
# Author: `Tim Martin, GIS Consultant, Pre and Post Sale Support
#
# Created: `31/07/2013
# Copyright: `(c) Ordnance Survey 2013
# Licence: `THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
# THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE
#-------------------------------------------------------------------------------
#Imported modules
import zipfile
import csv
import sys
import os
import StringIO
import time
from time import strftime
#Header lines for the new CSV files these are used later to when writing the header to the new CSV files
headings10=["RECORD_IDENTIFIER","CUSTODIAN_NAME","LOCAL_CUSTODIAN_NAME","PROCESS_DATE","VOLUME_NUMBER","ENTRY_DATE","TIME_STAMP","VERSION","FILE_TYPE"]
headings11=["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","USRN","RECORD_TYPE","SWA_ORG_REF_NAMING","STATE","STATE_DATE","STREET_SURFACE","STREET_CLASSIFICATION","VERSION","STREET_START_DATE","STREET_END_DATE","LAST_UPDATE_DATE","RECORD_ENTRY_DATE","STREET_START_X","STREET_START_Y","STREET_END_X","STREET_END_Y","STREET_TOLERANCE"]
headings15=["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","USRN","STREET_DESCRIPTION","LOCALITY_NAME","TOWN_NAME","ADMINSTRATIVE_AREA","LANGUAGE"]
headings21=["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","UPRN","LOGICAL_STATUS","BLPU_STATE","BLPU_STATE_DATE","PARENT_UPRN","X_COORDINATE","Y_COORDINATE","RPC","LOCAL_CUSTODIAN_CODE","START_DATE","END_DATE","LAST_UPDATE_DATE","ENTRY_DATE","POSTAL_ADDRESS","POSTCODE_LOCATOR","MULTI_OCC_COUNT"]
headings23=["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","UPRN","XREF_KEY","CROSS_REFERENCE","VERSION","SOURCE","START_DATE","END_DATE","LAST_UPDATE_DATE","ENTRY_DATE"]
headings24=["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","UPRN","LPI_KEY","LANGUAGE","LOGICAL_STATUS","START_DATE","END_DATE","LAST_UPDATE_DATE","ENTRY_DATE","SAO_START_NUMBER","SAO_START_SUFFIX","SAO_END_NUMBER","SAO_END_SUFFIX","SAO_TEXT","PAO_START_NUMBER","PAO_START_SUFFIX","PAO_END_NUMBER","PAO_END_SUFFIX","PAO_TEXT","USRN","USRN_MATCH_INDICATOR","AREA_NAME","LEVEL","OFFICIAL_FLAG"]
headings28=["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","UPRN","PARENT_ADDRESSABLE_UPRN","UDPRN","ORGANISATION_NAME","DEPARTMENT_NAME","SUB_BUILDING_NAME","BUILDING_NAME","BUILDING_NUMBER","DEPENDENT_THOROUGHFARE_NAME","THROUGHFARE_NAME","DOUBLE_DEPENDENT_LOCALITY","DEPENDENT_LOCALITY","POST_TOWN","POSTCODE","POSTCODE_TYPE","WELSH_DEPENDENT_THOROUGHFARE","WELSH_THOROUGHFARE_NAME","WELSH_DOUBLE_DEPENDENT_LOCALITY","WELSH_DEPENDENT_LOCALITY","WELSH_POST_TOWN","PO_BOX_NUMBER","PROCESS_DATE","START_DATE","END_DATE","LAST_UPDATE_DATE","ENTRY_DATE"]
headings29=["RECORD_IDENTIFIER","GAZ_NAME","GAZ_SCOPE","TER_OF_USE","LINKED_DATA","GAZ_OWNER","NGAZ_FREQ","CUSTODIAN_NAME","CUSTODIAN_UPRN","LOCAL_CUSTODIAN_CODE","CO_ORD_SYSTEM","CO_ORD_UNIT","META_DATE","CLASS_SCHEME","GAZ_DATE","LANGUAGE","CHARACTER_SET"]
headings30=["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","UPRN","SUCC_KEY","START_DATE","END_DATE","LAST_UPDATE_DATE","ENTRY_DATE","SUCCESSOR"]
headings31=["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","UPRN","ORG_KEY","ORGANISATION","LEGAL_NAME","START_DATE","END_DATE","LAST_UPDATE_DATE","ENTRY_DATE"]
headings32=["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","UPRN","CLASS_KEY","CLASSIFICATION_CODE","CLASS_SCHEME","SCHEME_VERSION","START_DATE","END_DATE","LAST_UPDATE_DATE","ENTRY_DATE"]
headings99=["RECORD_IDENTIFIER","NEXT_VOLUME_NUMBER","RECORD_COUNT","ENTRY_DATE","TIME_STAMP"]
def createCSV():
starttime = time.time()
#Rather than using arguments the program asks the user to input the path to the folder of OS AddressBase Premium files
print 'Please type in the full path to the directory of OS AddressBase zip files:'
directorypath = raw_input('Directory Path: ')
#These record types are used to test whether a CSV file found by the program are in fact OS AddressBase Premium files
recordtypes = ("10", "11", "15", "21", "23", "24", "28", "29", "30", "31", "32", "99")
#An emtpy array and counter used to store and count the number of CSV files the program finds
csvfileList = []
csvfileCount = 0
#An emtpy array and counter used to store and count the number of Zip files the program finds
zipfileList = []
zipfileCount = 0
#The following code walks the directory path that the user input earlier and searches for either CSV files or Zip files
#It will intially see if AddressBasePremium is in the filename if not it will then check the first value of the file against
#the record types above. Obviously is there is a CSV in the folder that has one of these record types as a first value
#it will also be included.
for dirname, dirnames, filenames in os.walk(directorypath):
for filename in filenames:
if filename.endswith(".csv"):
csvfile = os.path.join(directorypath, filename)
if "AddressBasePremium" in filename:
csvfileList.append(csvfile)
csvfileCount = csvfileCount + 1
else:
with open(csvfile) as f:
csvreader = csv.reader(f, delimiter=',', quotechar='"')
for row in csvreader:
if csvreader.line_num == 1:
identifier = row[0]
if identifier in recordtypes:
csvfileList.append(csvfile)
csvfileCount = csvfileCount + 1
else:
pass
#There are two types of zip file Non-geo chunks and Geo chunks and they have two different file name conventions
#So below are two try statements to deal with the Non-geo chunk zip files which have _csv at the end of the zip
#file name but not of the actual CSV file inside the zipfile. So we can remove the last 4 characters to get
#them to match
if filename.endswith(".zip"):
zippath = os.path.join(directorypath, filename)
if "AddressBasePremium" in filename:
zipfileList.append(zippath)
zipfileCount = zipfileCount + 1
else:
shortzipfile = os.path.splitext(filename)[0]
removecsvzip = shortzipfile[0:-4]
zipcsv = '.'.join([shortzipfile, 'csv'])
zipcsv2 = '.'.join([removecsvzip, 'csv'])
with open(zippath, 'rb') as zname:
zfile = zipfile.ZipFile(zname)
try:
data = StringIO.StringIO(zfile.read(zipcsv))
csvreader = csv.reader(data, delimiter=',', quotechar='"')
for row in csvreader:
if csvreader.line_num == 1:
identifier = row[0]
if identifier in recordtypes:
zipfileList.append(zippath)
zipfileCount = zipfileCount + 1
except KeyError as e:
data = StringIO.StringIO(zfile.read(zipcsv2))
csvreader = csv.reader(data, delimiter=',', quotechar='"')
for row in csvreader:
if csvreader.line_num == 1:
identifier = row[0]
if identifier in recordtypes:
zipfileList.append(zippath)
zipfileCount = zipfileCount + 1
finally:
pass
#The following section makes sure that it is dealing with the correct files, either CSV or Zip but not both types
try:
if csvfileCount > 0 and zipfileCount > 0:
print "Program has found both OS AddressBase Premium CSV and ZIP files."
print "Please tidy up the folder of files and try again"
time.sleep(5)
sys.exit()
else:
pass
if csvfileCount > 0:
print "Program has found %s OS AddressBase Premium CSV Files" %csvfileCount
else:
pass
if zipfileCount > 0:
print "Program has found %s OS AddressBase Premium Zipped CSV Files" %zipfileCount
else:
pass
if csvfileCount == 0 and zipfileCount == 0:
print "Program could not find any OS AddressBase Premium files and will now exit"
time.sleep(5)
sys.exit()
finally:
pass
#Create the Header_10.csv file, it checks to see if it exists first, if so deletes it, then creates a fresh one
if os.path.isfile('Header_10.csv'):
os.remove('Header_10.csv')
header_10 = open('Header_10.csv', 'a')
write10 = csv.writer(header_10, delimiter=',', quotechar='"', lineterminator='\n')
write10.writerow(headings10)
else:
header_10 = open('Header_10.csv', 'a')
write10 = csv.writer(header_10, delimiter=',', quotechar='"', lineterminator='\n')
write10.writerow(headings10)
#Create the Street_11.csv file, it checks to see if it exists first, if so deletes it, then creates a fresh one
if os.path.isfile('Street_11.csv'):
os.remove('Street_11.csv')
street_11 = open('Street_11.csv', 'a')
write11 = csv.writer(street_11, delimiter=',', quotechar='"', lineterminator='\n')
write11.writerow(headings11)
else:
street_11 = open('Street_11.csv', 'a')
write11 = csv.writer(street_11, delimiter=',', quotechar='"', lineterminator='\n')
write11.writerow(headings11)
#Create the StreetDesc_15.csv file, it checks to see if it exists first, if so deletes it, then creates a fresh one
if os.path.isfile('StreetDesc_15.csv'):
os.remove('StreetDesc_15.csv')
streetdesc_15 = open('StreetDesc_15.csv', 'a')
write15 = csv.writer(streetdesc_15, delimiter=',', quotechar='"', lineterminator='\n')
write15.writerow(headings15)
else:
streetdesc_15 = open('StreetDesc_15.csv', 'a')
write15 = csv.writer(streetdesc_15, delimiter=',', quotechar='"', lineterminator='\n')
write15.writerow(headings15)
#Create the BLPU_21.csv file, it checks to see if it exists first, if so deletes it, then creates a fresh one
if os.path.isfile('BLPU_21.csv'):
os.remove('BLPU_21.csv')
blpu_21 = open('BLPU_21.csv', 'a')
write21 = csv.writer(blpu_21, delimiter=',', quotechar='"', lineterminator='\n')
write21.writerow(headings21)
else:
blpu_21 = open('BLPU_21.csv', 'a')
write21 = csv.writer(blpu_21, delimiter=',', quotechar='"', lineterminator='\n')
write21.writerow(headings21)
#Create the XREF_23.csv file, it checks to see if it exists first, if so deletes it, then creates a fresh one
if os.path.isfile('XREF_23.csv'):
os.remove('XREF_23.csv')
xref_23 = open('XREF_23.csv', 'a')
write23 = csv.writer(xref_23, delimiter=',', quotechar='"', lineterminator='\n')
write23.writerow(headings23)
else:
xref_23 = open('XREF_23.csv', 'a')
write23 = csv.writer(xref_23, delimiter=',', quotechar='"', lineterminator='\n')
write23.writerow(headings23)
#Create the LPI_24.csv file, it checks to see if it exists first, if so deletes it, then creates a fresh one
if os.path.isfile('LPI_24.csv'):
os.remove('LPI_24.csv')
lpi_24 = open('LPI_24.csv', 'a')
write24 = csv.writer(lpi_24, delimiter=',', quotechar='"', lineterminator='\n')
write24.writerow(headings24)
else:
lpi_24 = open('LPI_24.csv', 'a')
write24 = csv.writer(lpi_24, delimiter=',', quotechar='"', lineterminator='\n')
write24.writerow(headings24)
#Create the DP_28.csv file, it checks to see if it exists first, if so deletes it, then creates a fresh one
if os.path.isfile('DP_28.csv'):
os.remove('DP_28.csv')
dp_28 = open('DP_28.csv', 'a')
write28 = csv.writer(dp_28, delimiter=',', quotechar='"', lineterminator='\n')
write28.writerow(headings28)
else:
dp_28 = open('DP_28.csv', 'a')
write28 = csv.writer(dp_28, delimiter=',', quotechar='"', lineterminator='\n')
write28.writerow(headings28)
#Create the Meta_29.csv file, it checks to see if it exists first, if so deletes it, then creates a fresh one
if os.path.isfile('Meta_29.csv'):
os.remove('Meta_29.csv')
meta_29 = open('Meta_29.csv', 'a')
write29 = csv.writer(meta_29, delimiter=',', quotechar='"', lineterminator='\n')
write29.writerow(headings29)
else:
meta_29 = open('Meta_29.csv', 'a')
write29 = csv.writer(meta_29, delimiter=',', quotechar='"', lineterminator='\n')
write29.writerow(headings29)
#Create the Suc_30.csv file, it checks to see if it exists first, if so deletes it, then creates a fresh one
if os.path.isfile('Suc_30.csv'):
os.remove('Suc_30.csv')
suc_30 = open('Suc_30.csv', 'a')
write30 = csv.writer(suc_30, delimiter=',', quotechar='"', lineterminator='\n')
write30.writerow(headings30)
else:
suc_30 = open('Suc_30.csv', 'a')
write30 = csv.writer(suc_30, delimiter=',', quotechar='"', lineterminator='\n')
write30.writerow(headings30)
#Create the Org_31.csv file, it checks to see if it exists first, if so deletes it, then creates a fresh one
if os.path.isfile('Org_31.csv'):
os.remove('Org_31.csv')
org_31 = open('Org_31.csv', 'a')
write31 = csv.writer(org_31, delimiter=',', quotechar='"', lineterminator='\n')
write31.writerow(headings31)
else:
org_31 = open('Org_31.csv', 'a')
write31 = csv.writer(org_31, delimiter=',', quotechar='"', lineterminator='\n')
write31.writerow(headings31)
#Create the Class_32.csv file, it checks to see if it exists first, if so deletes it, then creates a fresh one
class_32 = open('Class_3if os.path.isfile('Class_32.csv'):
os.remove('Class_32.csv')2.csv', 'a')
write32 = csv.writer(class_32, delimiter=',', quotechar='"', lineterminator='\n')
write32.writerow(headings32)
else:
class_32 = open('Class_32.csv', 'a')
write32 = csv.writer(class_32, delimiter=',', quotechar='"', lineterminator='\n')
write32.writerow(headings32)
#Create the Trailer_99.csv file, it checks to see if it exists first, if so deletes it, then creates a fresh one
if os.path.isfile('Trailer_99.csv'):
os.remove('Trailer_99.csv')
trailer_99 = open('Trailer_99.csv', 'a')
write99 = csv.writer(trailer_99, delimiter=',', quotechar='"', lineterminator='\n')
write99.writerow(headings99)
else:
trailer_99 = open('Trailer_99.csv', 'a')
write99 = csv.writer(trailer_99, delimiter=',', quotechar='"', lineterminator='\n')
write99.writerow(headings99)
print 'Finished creating empty CSV files with Header line'
#The following coutners are used to keep track of how many records of each Record Identifier type are found
counter10 = 0
counter11 = 0
counter15 = 0
counter21 = 0
counter23 = 0
counter24 = 0
counter28 = 0
counter29 = 0
counter30 = 0
counter31 = 0
counter32 = 0
counter99 = 0
#Counter to keep track of the number of files processed
processed = 0
#There is a different routine for processing CSV files compared to ZIP files
#This sections processes the CSV files using the Python CSV reader and write modules
#It used the first value of the row to dtermine which CSV file that row should be written to.
if csvfileCount > 0:
print "Program will now split the OS AddressBase Premium files"
for filepath in csvfileList:
processed = processed + 1
print "Processing file number " + str(processed) + " out of " + str(csvfileCount)
with open(filepath) as f:
csvreader = csv.reader(f, delimiter=',', quotechar='"')
try:
for row in csvreader:
abtype = row[0]
if "10" in abtype:
write10.writerow(row)
counter10 = counter10 + 1
elif "11" in abtype:
write11.writerow(row)
counter11 = counter11 + 1
elif "15" in abtype:
write15.writerow(row)
counter15 = counter15 + 1
elif "21" in abtype:
write21.writerow(row)
counter21 = counter21 + 1
elif "23" in abtype:
write23.writerow(row)
counter23 = counter23 + 1
elif "24" in abtype:
write24.writerow(row)
counter24 = counter24 + 1
elif "28" in abtype:
write28.writerow(row)
counter28 = counter28 + 1
elif "29" in abtype:
write29.writerow(row)
counter29 = counter29 + 1
elif "30" in abtype:
write30.writerow(row)
counter30 = counter30 + 1
elif "31" in abtype:
write31.writerow(row)
counter31 = counter31 + 1
elif "32" in abtype:
write32.writerow(row)
counter32 = counter32 + 1
elif "99" in abtype:
write99.writerow(row)
counter99 = counter99 + 1
else:
pass
except KeyError as e:
pass
header_10.close()
street_11.close()
streetdesc_15.close()
blpu_21.close()
xref_23.close()
lpi_24.close()
dp_28.close()
meta_29.close()
suc_30.close()
org_31.close()
class_32.close()
trailer_99.close()
else:
pass
#The following section processes the ZIp files.
#It uses the Python Zipfile module to read the data directly from the Zip file preventing the user having
#to extract the files before splitting the data.
if zipfileCount > 0:
print "Program will now split the OS AddressBase Premium files"
for filepath in zipfileList:
processed = processed + 1
print "Processing file number " + str(processed) + " out of " + str(zipfileCount)
basename = os.path.basename(filepath)
shortzipfile = os.path.splitext(basename)[0]
removecsvzip = shortzipfile[0:-4]
zipcsv = '.'.join([shortzipfile, 'csv'])
zipcsv2 = '.'.join([removecsvzip, 'csv'])
with open(filepath, 'rb') as zname:
zfile = zipfile.ZipFile(zname)
try:
data = StringIO.StringIO(zfile.read(zipcsv))
csvreader = csv.reader(data, delimiter=',', quotechar='"')
for row in csvreader:
abtype = row[0]
if "10" in abtype:
write10.writerow(row)
counter10 = counter10 + 1
elif "11" in abtype:
write11.writerow(row)
counter11 = counter11 + 1
elif "15" in abtype:
write15.writerow(row)
counter15 = counter15 + 1
elif "21" in abtype:
write21.writerow(row)
counter21 = counter21 + 1
elif "23" in abtype:
write23.writerow(row)
counter23 = counter23 + 1
elif "24" in abtype:
write24.writerow(row)
counter24 = counter24 + 1
elif "28" in abtype:
write28.writerow(row)
counter28 = counter28 + 1
elif "29" in abtype:
write29.writerow(row)
counter29 = counter29 + 1
elif "30" in abtype:
write30.writerow(row)
counter30 = counter30 + 1
elif "31" in abtype:
write31.writerow(row)
counter31 = counter31 + 1
elif "32" in abtype:
write32.writerow(row)
counter32 = counter32 + 1
elif "99" in abtype:
write99.writerow(row)
counter99 = counter99 + 1
else:
pass
except KeyError as e:
data = StringIO.StringIO(zfile.read(zipcsv2))
csvreader = csv.reader(data, delimiter=',', quotechar='"')
for row in csvreader:
abtype = row[0]
if "10" in abtype:
write10.writerow(row)
counter10 = counter10 + 1
elif "11" in abtype:
write11.writerow(row)
counter11 = counter11 + 1
elif "15" in abtype:
write15.writerow(row)
counter15 = counter15 + 1
elif "21" in abtype:
write21.writerow(row)
counter21 = counter21 + 1
elif "23" in abtype:
write23.writerow(row)
counter23 = counter23 + 1
elif "24" in abtype:
write24.writerow(row)
counter24 = counter24 + 1
elif "28" in abtype:
write28.writerow(row)
counter28 = counter28 + 1
elif "29" in abtype:
write29.writerow(row)
counter29 = counter29 + 1
elif "30" in abtype:
write30.writerow(row)
counter30 = counter30 + 1
elif "31" in abtype:
write31.writerow(row)
counter31 = counter31 + 1
elif "32" in abtype:
write32.writerow(row)
counter32 = counter32 + 1
elif "99" in abtype:
write99.writerow(row)
counter99 = counter99 + 1
else:
pass
finally:
pass
header_10.close()
street_11.close()
streetdesc_15.close()
blpu_21.close()
xref_23.close()
lpi_24.close()
dp_28.close()
meta_29.close()
suc_30.close()
org_31.close()
class_32.close()
trailer_99.close()
endtime = time.time()
elapsed = endtime - starttime
#Summary statistics showing number of records and time taken
print "Program has finished splitting the OS AddressBase Premium Files"
print 'Finished translating data at ', strftime("%a, %d %b %Y %H:%M:%S")
print 'Elapsed time: ', round(elapsed/60,1), ' minutes'
print "Number of Header Records: %s" %str(counter10)
print "Number of Street Records: %s" %str(counter11)
print "Number of Street Descriptor Records: %s" %str(counter15)
print "Number of BLPU Records: %s" %str(counter21)
print "Number of XRef Records: %s" %str(counter23)
print "Number of LPI Records: %s" %str(counter24)
print "Number of Delivery Point Records: %s" %str(counter28)
print "Number of Metadata Records: %s" %str(counter29)
print "Number of Successor Records: %s" %str(counter30)
print "Number of Organisation Records: %s" %str(counter31)
print "Number of Classification Records: %s" %str(counter32)
print "Number of Trailer Records: %s" %str(counter99)
def main():
createCSV()
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment