import json
import csv
from geojson_rewind import rewind
def is_clockwise(poly):  # linear-ring analysis function, checks the directionality of ring winding via the shoelace formula
    total = poly[-1][0] * poly[0][1] - poly[0][0] * poly[-1][1]
    for i in range(len(poly) - 1):
        total += poly[i][0] * poly[i + 1][1] - poly[i + 1][0] * poly[i][1]
    return total <= 0  # a non-positive signed area means the ring winds clockwise

def correctRings(poly):  # repair a polygon's winding by reversing each of its linear rings
    return [list(reversed(ring)) for ring in poly]
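# Quick sanity checks (illustrative, assuming planar [x, y] coordinate pairs):
# a unit square traced counterclockwise is not clockwise, and once its rings are
# reversed by correctRings() the exterior reads as clockwise.
assert is_clockwise([[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]]) is False
assert is_clockwise(correctRings([[[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]]])[0]) is True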
def centroid(points):  # centroid calculation function; takes the mean of all points in the list to generate a center point location
    _len = len(points)
    centroid_x = sum(p[0] for p in points) / _len
    centroid_y = sum(p[1] for p in points) / _len
    return [centroid_x, centroid_y]
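# Quick sanity check (illustrative): the centroid of a 2x2 square's corners is
# its center. Note that a closed GeoJSON ring repeats its first point, which
# slightly biases this unweighted mean toward that vertex.
assert centroid([[0, 0], [2, 0], [2, 2], [0, 2]]) == [1.0, 1.0]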
# Create a lookup table for Territories, so we can merge
# the coordinate tables with each Territory
csv_rows_by_zip = {}
print("Load SalesForce Territories")
with open('./Territories__c__Zip+ID.csv') as csvIn:  # open the territory data file
    dictReader = csv.DictReader(csvIn, delimiter=',', quotechar='"')
    for row in dictReader:  # load the entire list of territories into a lookup table, indexed by ZIP code (a string)
        csv_rows_by_zip[row['Five_Digit_Zipcode__c']] = row
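# Illustrative shape of one lookup entry (hypothetical Id and ZIP values):
#   csv_rows_by_zip['30301'] -> {'Id': 'a0B4x0000012AbcEAE', 'Five_Digit_Zipcode__c': '30301', ...}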
print("Opening output file") | |
csvFileOut = open('./SF_Zips_With_GeoBounds.csv', 'w', newline='') # open the OUT file - If this file exists, it gets overwritten dog | |
fieldNames = ['Id', | |
'Five_Digit_Zipcode__c', | |
'geoType', | |
'centroid_x__c', | |
'centroid_y__c', | |
'GeoStr_A__c', 'GeoStr_B__c', 'GeoStr_C__c', 'GeoStr_D__c', 'GeoStr_E__c', 'GeoStr_F__c', 'GeoStr_G__c', | |
'GeoStr_H__c', 'GeoStr_I__c', 'GeoStr_J__c', 'GeoStr_K__c', 'GeoStr_L__c', 'GeoStr_M__c', 'GeoStr_N__c', | |
'GeoStr_O__c', 'GeoStr_P__c', 'GeoStr_Q__c', 'GeoStr_R__c', 'GeoStr_S__c', 'GeoStr_T__c', 'GeoStr_U__c', | |
'GeoStr_V__c', 'GeoStr_W__c', 'GeoStr_X__c', 'GeoStr_Y__c', 'GeoStr_Z__c', 'GeoStr_AA__c', 'GeoStr_AB__c', | |
'GeoStr_AC__c', 'GeoStr_AD__c', 'GeoStr_AE__c', 'GeoStr_AF__c', 'GeoStr_AG__c', 'GeoStr_AH__c', 'GeoStr_AI__c', | |
'GeoStr_AJ__c', 'GeoStr_AK__c', 'GeoStr_AL__c', 'GeoStr_AM__c', 'GeoStr_AN__c', 'GeoStr_AO__c', 'GeoStr_AP__c', | |
'GeoStr_AQ__c', 'GeoStr_AR__c', 'GeoStr_AS__c', 'GeoStr_AT__c', 'GeoStr_AU__c', 'GeoStr_AV__c', 'GeoStr_AW__c', | |
'GeoStr_AX__c', 'GeoStr_AY__c', 'GeoStr_AZ__c' | |
] | |
print("Generating CSV header") | |
print(fieldNames) | |
dictWriter = csv.DictWriter(csvFileOut, # open an output file handle for the CSV data | |
fieldnames=fieldNames, | |
delimiter=',', | |
quotechar='"', | |
dialect='unix' ) | |
dictWriter.writeheader() # and write the CSV header to that file | |
print("loading GeoJSON") | |
with open('./zcta5.geo.json') as geoFile: # open and load the GeoJSON file | |
data = json.load(geoFile) | |
features = data.get('features') # get the feature list from it | |
geoStrMaxLength = 0 | |
recordsWritten = 0 | |
recordsSkippedMissingGeo = 0 | |
recordsSkippedList = [] | |
recordsSkippedMissingSF_Record = 0 | |
cellsContinued = 0 | |
# how many features we got to swizzle through? | |
featureCount = len(features) | |
for featureIdx in range(featureCount):  # iterate the feature list, extracting and correcting each
    feature = features[featureIdx]
    geo = feature['geometry']  # extract the geometry object from the feature. We can reconstitute everything else in the GeoJSON from field properties
    if geo is not None:  # look to make sure it's actually there...
        polygons = []
        # stage 1 is to test the geometry for the correct ring winding order.
        # Extract the geo-coordinate data: in the case of a MultiPolygon, it's an
        # array of polygons that has to be iterated; in the case of a Polygon, it's
        # a single GeoJSON polygon. To treat both the same way, a lone polygon is
        # temporarily wrapped in an array, similar to a MultiPolygon, and processed
        # by the same code path.
        if geo['type'] == 'MultiPolygon':
            try:
                polygons = geo['coordinates']  # a MultiPolygon contains an array of polygons, so it's already an array here
            except KeyError:
                recordsSkippedMissingGeo += 1
                continue
        else:
            try:
                polygons = [geo['coordinates']]  # wrap the single polygon data object (an array of rings) in a list,
                                                 # so one list processor does all the work, simulating a MultiPolygon
            except KeyError:
                recordsSkippedMissingGeo += 1
                continue
        polygonCount = len(polygons)  # how many polygons to iterate?
        for polyIndex in range(polygonCount):  # iterate the polygons
            poly = polygons[polyIndex]  # get the polygon (a list of linear rings)
            ring = poly[0]  # get the first ring (the exterior)
            if not is_clockwise(ring):  # test it for clockwise-ness
                polygons[polyIndex] = correctRings(poly)  # repair if necessary
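        # Note: the imported geojson_rewind.rewind() could normalize winding for
        # the whole feature instead (illustrative sketch; rfc7946=False asks for
        # clockwise exterior rings):
        #   feature = rewind(feature, rfc7946=False)
        # The manual is_clockwise()/correctRings() pass above sticks with the
        # script's own shoelace test.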
        zipCode = feature['properties']['GEOID10']  # GEOID10 is the GEOID of the zone we're interested in, i.e. the ZIP code
        if zipCode in csv_rows_by_zip:
            row = csv_rows_by_zip[zipCode]  # grab the CSV row from SF
            row['geoType'] = geo['type'][0]  # store the first character of the geometry type ('M' for MultiPolygon, 'P' for Polygon)
        else:
            recordsSkippedMissingSF_Record += 1
            continue
        # stage 2 is to generate a centroid value so that each record
        # can be queried for whether or not it falls within a geographical area.
        # Note that for MultiPolygons, we first generate a centroid for each
        # polygon's exterior ring, and then, using that group of centroids,
        # generate a centroid for the GROUP of polygons
        if geo['type'] == 'MultiPolygon':
            coords = polygons  # assign the polygon list to the geo data straight in
            ringCentroids = []  # make a variable to hold the temporary list
            for poly in coords:  # loop through the polygons in the coordinate list
                cent_coord = poly[0]  # grab each polygon's exterior ring
                ringCentroids.append(centroid(cent_coord))  # generate a centroid and push it into the ringCentroids array
            groupCentroid = centroid(ringCentroids)  # after crunching the individual rings, generate a new centroid from the group of centroids
            row['centroid_x__c'] = groupCentroid[0]  # store values in the row
            row['centroid_y__c'] = groupCentroid[1]
        else:
            coords = polygons[0][0]  # for a lone Polygon, it's an array of ONE polygon; take its exterior ring, one centroid is enough
            polyCent = centroid(coords)  # get the centroid using just the bare list of coordinates
            row['centroid_x__c'] = polyCent[0]  # store values in the row
            row['centroid_y__c'] = polyCent[1]
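        # Illustrative: for a MultiPolygon of two unit squares centered at (0, 0)
        # and (4, 0), the per-ring centroids are [0, 0] and [4, 0], so the group
        # centroid comes out as [2.0, 0.0]: a plain mean of centroids, not
        # weighted by ring area.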
        # stage 3: merge the corrected polygon data with the Salesforce Territory objects via their ZipCode fields
        geoStr = json.dumps(coords, separators=(',', ':'))  # convert the geometry to a compact JSON string
        geoStrLength = len(geoStr)  # get the size of the resulting string
        if geoStrLength > geoStrMaxLength:
            geoStrMaxLength = geoStrLength  # accumulated maximum size of the geoStr, required to project the final required storage
        SF_Field_Size_Limit = 32000  # max possible string size PER FIELD
        steps = int(geoStrLength / SF_Field_Size_Limit) + 1  # add 1 extra step to handle the remainder
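        # Illustrative arithmetic (hypothetical sizes): a 70,000-char geoStr needs
        # int(70000 / 32000) + 1 = 3 steps, slicing [0:32000], [32000:64000] and
        # [64000:70000]; an exact multiple of 32,000 yields one empty trailing
        # slice, which is harmless in the output.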
        for step in range(steps):  # iterate the steps
            fieldName = fieldNames[step + 5]  # pick the target field name from our list, offset by 5 to skip the first 5 key columns
            stepOff = step * SF_Field_Size_Limit  # calculate the offset into the geostring, depending upon how far along we are
            geoSlice = geoStr[stepOff:stepOff + SF_Field_Size_Limit]  # extract a slice of the right size from the string
            ##print("Writing field ", fieldName)
            row[fieldName] = geoSlice  # stuff that slice into the destination row
            # The SF Data Loader will only accept records < 400,000 characters in
            # length. Additionally, the Einstein Analytics dataset importer can
            # only accept cells of up to 32K characters, so we can't use the full
            # width of the SF custom object in a single record: we're limited to
            # 32K cells and at most 12 GeoStr fields per output record. For a long
            # ZIP coordinate set that covers more than 12 fields, we emit an
            # export record for each group of up to 12 cells that the text data
            # consumes; each continuation record repeats the key fields and
            # carries the next cells, or whatever remainder of the text is left.
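            # Illustrative (hypothetical ZIP): a geometry needing 15 slices emits
            # a first record carrying GeoStr_A__c..GeoStr_L__c (steps 0-11), then
            # a continuation record repeating the five key fields plus
            # GeoStr_M__c..GeoStr_O__c (steps 12-14).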
            if step == steps - 1:  # if we've run out of steps, write the final record
                # print("Writing end record: " + row['Five_Digit_Zipcode__c'])
                dictWriter.writerow(row)
                recordsWritten += 1
            elif (step + 1) % 12 == 0:  # otherwise, on every 12th step, emit a record before the Data Loader's record-length limit is hit
                print("starting continuation record: " + row['Five_Digit_Zipcode__c'])
                dictWriter.writerow(row)
                cellsContinued += 1
                recordsWritten += 1
                newRow = {  # generate a fresh copy of the key fields to be the next export record
                    "Id": row["Id"],
                    "Five_Digit_Zipcode__c": row["Five_Digit_Zipcode__c"],
                    "geoType": row["geoType"],
                    "centroid_x__c": row['centroid_x__c'],
                    "centroid_y__c": row["centroid_y__c"]
                }
                row = newRow
                # this object then replaces the row that was previously generated
                # for this feature; on the next iteration of this loop, the script
                # will be operating on the continuation record for this feature
    else:
        recordsSkippedMissingGeo += 1
print("Results:") | |
print('Records not processed:', recordsSkippedMissingGeo) | |
print('Missing SF Zip records: ', recordsSkippedMissingSF_Record) | |
print("Records processed:", featureCount) | |
print("Records exported: ", recordsWritten) | |
print("Contination Records:", cellsContinued) | |
print("Required GeoJSON Column Size", geoStrMaxLength) | |
quit() |