Created
May 11, 2022 21:03
-
-
Save sebastianknopf/e9e0381efab5fc872d726c28eca1b19f to your computer and use it in GitHub Desktop.
Unifies objects into a comparable (hash) string regardless of their spelling. Use case: checking persons and addresses against a blacklist without permanently storing personal data in a database.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib | |
import re | |
import unittest | |
from abc import abstractmethod | |
class ObjectUnifier:
    """Base class that reduces an object to a normalized, comparable string.

    Subclasses feed values through ``_add_token`` (typically from their
    ``_generate_token_list`` implementation). The normalized tokens are
    joined with ``#`` to form the unified string, which is also available
    as a SHA-256 hex digest.
    """

    # Immutable class-level fallback only. Each instance owns its own list
    # (created in ``__init__`` or lazily in ``_add_token``); a mutable list
    # here would be shared by, and leak tokens between, all instances.
    _token_list = ()
    # Regex used by subclasses to split a value into parts: one or more
    # whitespace, "+" or "." characters.
    _token_delimiter = "[\\s+\\.]+"

    def __init__(self):
        """Start with an empty, instance-private token list."""
        self._token_list = []

    def _add_token(self, token):
        """Normalize ``token`` and append it to this instance's token list.

        The value is converted with ``str()`` (so non-string values such as
        integers are accepted), lower-cased, and every run of characters
        other than ``a-z``, ``0-9`` and ``-`` is replaced by a single ``_``.
        """
        # Subclasses are not required to call ``__init__``; make sure we
        # never append to something that lives on the class.
        if "_token_list" not in self.__dict__:
            self._token_list = []
        normalized = re.sub("[^a-z0-9-]+", "_", str(token).lower())
        self._token_list.append(normalized)

    # NOTE: this class does not use ``ABCMeta``, so the decorator is not
    # enforced at instantiation time; it only marks the intended override.
    @abstractmethod
    def _generate_token_list(self):
        """Populate the token list; to be implemented by subclasses."""
        pass

    def get_unified_object_string(self):
        """Return all tokens joined by ``#``."""
        return "#".join(self._token_list)

    def get_unified_object_hash(self):
        """Return the SHA-256 hex digest of the UTF-8 encoded unified string."""
        unified = self.get_unified_object_string()
        return hashlib.sha256(unified.encode("utf-8")).hexdigest()

    def __str__(self):
        return self.get_unified_object_string()
class PersonUnifier(ObjectUnifier):
    """Unify a person into the string ``gender#last_name#first_name#birthday``.

    Only the first part of the first name (split on ``_token_delimiter``)
    is used, so additional given names do not change the result.
    """

    _gender = None
    _last_name = None
    _first_name = None
    _birthday = None

    def __init__(self, gender, last_name, first_name, birthday):
        """Store the person's attributes and build the token list.

        Args:
            gender: Gender value, used as the first token.
            last_name: Last name, used as a whole.
            first_name: First name(s); only the first part is used.
            birthday: Birthday value, used as the last token.
        """
        super().__init__()
        # Explicit per-instance list so tokens are never shared.
        self._token_list = []
        self._gender = gender
        self._last_name = last_name
        self._first_name = first_name
        self._birthday = birthday
        self._generate_token_list()

    def _generate_token_list(self):
        """Add the gender, last name, first given name and birthday tokens."""
        self._add_token(self._gender)
        self._add_token(self._last_name)

        # re.split yields a leading "" when the value starts with a
        # delimiter (e.g. " Max"); pick the first non-empty part so the
        # first name is not silently dropped. Falls back to "" when the
        # value contains no name part at all.
        name_parts = re.split(self._token_delimiter, self._first_name)
        first_given_name = next((part for part in name_parts if part), "")
        self._add_token(first_given_name)

        self._add_token(self._birthday)
class AddressUnifier(ObjectUnifier):
    """Unify an address into ``street#house_number#postal_prefix#city``.

    Street words are abbreviated and joined with ``-``; only the first two
    characters of the postal code are used; parenthesised additions are
    removed from the city name.
    """

    _street = None
    _house_number = None
    _postal_code = None
    _city = None

    def __init__(self, street, house_number, postal_code, city):
        """Store the address attributes and build the token list."""
        self._token_list = []
        self._street = street
        self._house_number = house_number
        self._postal_code = postal_code
        self._city = city
        self._generate_token_list()

    def _generate_token_list(self):
        """Add the street, house number, postal prefix and city tokens."""
        # Street: split on whitespace, "." and "-", dropping empty parts.
        street_words = [w for w in re.split("[\\s.-]+", self._street) if w]
        final_index = len(street_words) - 1
        street_parts = []
        for index, word in enumerate(street_words):
            # Every word is padded with "_" to at least three characters.
            padded = word.ljust(3, "_")
            if index == final_index:
                # Last word: strip the "aße"/"asse" endings so that
                # "...straße", "...strasse" and "...str" coincide.
                padded = padded.replace("aße", "").replace("asse", "")
            else:
                # Leading words are cut down to three characters.
                padded = padded[:3]
            street_parts.append(padded)
        self._add_token("-".join(street_parts))

        self._add_token(self._house_number)

        # Only the first two characters of the postal code are kept.
        self._add_token(self._postal_code[:2])

        # City: remove parenthesised additions, then split on whitespace
        # and "." and pad each remaining word to three characters.
        bare_city = re.sub("(\\s?\\([^()]*\\))+", "", self._city)
        city_parts = [
            word.ljust(3, "_")
            for word in re.split("[\\s.]+", bare_city)
            if word
        ]
        self._add_token("-".join(city_parts))
class PersonUnifierTest(unittest.TestCase):
    """Checks the unified string produced by ``PersonUnifier``."""

    def test_person_unifier(self):
        # (expected unified string, constructor arguments)
        cases = [
            (
                "male#mustermann#max#01_01_1990",
                ("male", "Mustermann", "Max", "01.01.1990"),
            ),
            (
                "male#mustermann-tester#max#01_01_1990",
                ("male", "Mustermann-Tester", "Max", "01.01.1990"),
            ),
            (
                "male#mustermann#m_x#01_01_1990",
                ("male", "mustermann", "mäx", "01.01.1990"),
            ),
        ]
        for expected, arguments in cases:
            self.assertEqual(expected, str(PersonUnifier(*arguments)))
class AddressUnifierTest(unittest.TestCase):
    """Checks that different spellings of an address unify to one string."""

    def test_address_unifier(self):
        # Expected unified string -> constructor arguments of every
        # spelling that must map to it. Each spelling is listed once;
        # multi-space variants cover repeated delimiters.
        cases = {
            "heimweg#5#77#musterstadt": [
                ("Heimweg", "5", "77777", "Musterstadt"),
            ],
            "teststr#5#77#test": [
                ("Teststraße", "5", "77777", "Test"),
                ("Teststr.", "5", "77777", "Test"),
            ],
            "tes-str#5#77#test": [
                ("Test Straße", "5", "77777", "Test"),
                ("Test Str.", "5", "77777", "Test"),
                ("Test-Straße", "5", "77777", "Test"),
                ("Test-Str", "5", "77777", "Test"),
                ("Test-Str.", "5", "77777", "Test"),
            ],
            "st_-tes-weg#5c#68#musterstadt": [
                ("St.Tester Weg", "5C", "68390", "Musterstadt"),
                ("St. Tester Weg", "5C", "68390", "Musterstadt"),
                ("St. Tester-Weg", "5C", "68390", "Musterstadt"),
            ],
            "am_-testweg#5c#68#musterstadt": [
                ("Am Testweg", "5C", "68390", "Musterstadt"),
                ("Am  Testweg", "5C", "68390", "Musterstadt"),
            ],
            "unt-testweg#222#75#daheim": [
                ("Unterer Testweg", "222", "75175", "Daheim"),
                ("Unt. Testweg", "222", "75175", "Daheim"),
                ("Unt.  Testweg", "222", "75175", "Daheim"),
            ],
            "unt-tes-weg#222#75#daheim": [
                ("Unterer Test-Weg", "222", "75175", "Daheim"),
                ("Unt. Test-Weg", "222", "75175", "Daheim"),
                ("unt test weg", "222", "75175", "Daheim"),
            ],
            "demohaus#10#88#daheim": [
                ("Demohaus", "10", "88999", "Daheim (Test)"),
                ("Demohaus", "10", "88999", "Daheim ( Test)"),
                ("Demohaus", "10", "88999", "Daheim (test.test)"),
            ],
        }
        for expected, spellings in cases.items():
            for arguments in spellings:
                # subTest reports every failing spelling instead of
                # stopping at the first one.
                with self.subTest(arguments=arguments):
                    self.assertEqual(expected, str(AddressUnifier(*arguments)))
# Run the unit tests above when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment