Last active
August 10, 2018 08:06
-
-
Save kuk/40344ddc6ef9a6807c349610c4a1e4ca to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.ipynb_checkpoints/ | |
PullentiPython/ | |
news.txt |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
sys.path.append('PullentiPython') | |
import json | |
from collections import namedtuple, Counter, defaultdict | |
from random import seed, sample | |
from ipymarkup import ( | |
Span as MarkupSpan, | |
AsciiMarkup, | |
LineMarkup, | |
LineLabelMarkup | |
) | |
from tqdm import tqdm as log_progress | |
def load_lines(path, encoding='utf8'):
    """Lazily yield the lines of a text file without their trailing newline.

    The file is decoded with an explicit ``encoding`` (UTF-8 by default)
    instead of the platform default, so non-ASCII text is read the same way
    on every machine.

    :param path: path of the text file to read.
    :param encoding: codec used to decode the file.
    :returns: generator of lines with the trailing ``'\\n'`` removed.
    """
    with open(path, encoding=encoding) as file:
        for line in file:
            # Only the newline is stripped; other trailing whitespace stays.
            yield line.rstrip('\n')
def format_json(data):
    """Render ``data`` as JSON text indented by two spaces.

    Non-ASCII characters are written as-is rather than escaped.
    """
    options = dict(indent=2, ensure_ascii=False)
    return json.dumps(data, **options)
def show_json(data):
    """Print ``data`` formatted by ``format_json``."""
    text = format_json(data)
    print(text)
########## | |
# | |
# UTILS | |
# | |
######### | |
from collections import OrderedDict | |
def assert_type(item, types):
    """Raise ``TypeError`` unless ``item`` is an instance of ``types``.

    ``types`` is either a single class or a tuple of classes; the error
    message lists the expected class names joined with ``or``.
    """
    if isinstance(item, types):
        return
    expected = types if isinstance(types, tuple) else (types,)
    names = [cls.__name__ for cls in expected]
    message = 'expected {types}, got {type}'.format(
        types=' or '.join(names),
        type=type(item).__name__
    )
    raise TypeError(message)
def assert_not_empty(item):
    """Raise ``ValueError`` when ``len(item)`` is zero."""
    size = len(item)
    if not size:
        raise ValueError('expected not empty')
def assert_one_of(item, items):
    """Raise ``ValueError`` unless ``item`` is contained in ``items``."""
    if item in items:
        return
    message = '{item!r} not in {items!r}'.format(item=item, items=items)
    raise ValueError(message)
def _jsonify_value(value):
    """Convert one attribute value into its JSON-ready form.

    Records become their ``as_json`` mapping, lists are converted item by
    item (recursively), and every other value is returned unchanged.
    """
    if isinstance(value, Record):
        return value.as_json
    if isinstance(value, list):
        return [_jsonify_value(_) for _ in value]
    return value


def jsonify(record):
    """Build an ordered mapping of a record's attributes.

    Keys follow the order of ``record.__attributes__``.  List attributes may
    mix records with plain values (strings, numbers, ...); plain values are
    kept as they are instead of being treated as records.

    :param record: object exposing ``__attributes__``.
    :returns: ``OrderedDict`` mapping attribute name to converted value.
    """
    data = OrderedDict()
    for key in record.__attributes__:
        data[key] = _jsonify_value(getattr(record, key))
    return data


class Record(object):
    """Base class for simple value objects.

    Subclasses list their field names in ``__attributes__``; equality,
    iteration, hashing, ``repr`` and JSON conversion are all derived from
    that list.
    """

    __attributes__ = []

    def __eq__(self, other):
        # Records are equal only when they share the exact type and every
        # listed attribute compares equal.
        return (
            type(self) == type(other)
            and all(
                (getattr(self, _) == getattr(other, _))
                for _ in self.__attributes__
            )
        )

    def __ne__(self, other):
        return not self == other

    def __iter__(self):
        # Yields attribute values in ``__attributes__`` order, which lets
        # callers unpack a record like a tuple.
        return (getattr(self, _) for _ in self.__attributes__)

    def __hash__(self):
        # Raises TypeError when an attribute value is unhashable (a list).
        return hash(tuple(self))

    @property
    def as_json(self):
        """Ordered mapping of the attributes, see ``jsonify``."""
        return jsonify(self)

    def __repr__(self):
        name = self.__class__.__name__
        args = ', '.join(
            '{key}={value!r}'.format(
                key=_,
                value=getattr(self, _)
            )
            for _ in self.__attributes__
        )
        return '{name}({args})'.format(
            name=name,
            args=args
        )

    def _repr_pretty_(self, printer, cycle):
        # Pretty-printer hook: one attribute per line, indented by 4.
        name = self.__class__.__name__
        if cycle:
            printer.text('{name}(...)'.format(name=name))
        else:
            printer.text('{name}('.format(name=name))
            keys = self.__attributes__
            size = len(keys)
            if size:
                with printer.indent(4):
                    printer.break_()
                    for index, key in enumerate(keys):
                        printer.text(key + '=')
                        value = getattr(self, key)
                        printer.pretty(value)
                        # No trailing comma after the last attribute.
                        if index < size - 1:
                            printer.text(',')
                            printer.break_()
                printer.break_()
            printer.text(')')
############## | |
# | |
# LANG | |
# | |
########## | |
from pullenti.morph.MorphLang import MorphLang | |
from pullenti.morph.Morphology import Morphology | |
from pullenti.morph.Explanatory import Explanatory | |
# Language codes.  langs_to_raw() resolves each code with
# getattr(MorphLang, code), so every value must name a MorphLang attribute.
RU = 'RU'
UA = 'UA'
BY = 'BY'
EN = 'EN'
IT = 'IT'
KZ = 'KZ'
# Every code accepted by set_langs() and raw_to_langs().
LANGS = {RU, UA, BY, EN, IT, KZ}
# Used by Processor when no language is loaded yet.
DEFAULT_LANGS = {RU, EN}
def langs_to_raw(langs):
    """Combine language codes into a single ``MorphLang`` value.

    Starts from an empty ``MorphLang()`` and ORs in the ``MorphLang``
    attribute named by each code.
    """
    combined = MorphLang()
    for code in langs:
        combined |= getattr(MorphLang, code)
    return combined
def raw_to_langs(raw):
    """Turn a raw language value into a set of language codes.

    The raw value is stringified (for example ``RU;EN``) and split on
    ``;``; an empty string gives an empty set.  Each code is validated
    against ``LANGS``.
    """
    text = str(raw)  # RU;EN
    codes = text.split(';') if text else []
    for code in codes:
        assert_one_of(code, LANGS)
    return set(codes)
def loaded_langs():
    """Return the set of language codes that ``Morphology`` reports as loaded."""
    return raw_to_langs(Morphology._get_loaded_languages())
def unload_langs(langs):
    """Unload the given language codes from ``Morphology`` and ``Explanatory``."""
    mask = langs_to_raw(langs)
    for component in (Morphology, Explanatory):
        component.unload_languages(mask)
def load_langs(langs):
    """Load the given language codes into ``Morphology`` and ``Explanatory``."""
    mask = langs_to_raw(langs)
    for component in (Morphology, Explanatory):
        component.load_languages(mask)
def set_langs(langs):
    """Make the loaded languages match ``langs``.

    ``langs`` must be non-empty and contain only codes from ``LANGS``.
    Currently loaded languages that were not requested are unloaded, then
    the requested ones are loaded.
    """
    wanted = set(langs)
    assert_not_empty(wanted)
    for code in wanted:
        assert_one_of(code, LANGS)

    # Loaded right now but not requested.
    extra = loaded_langs() - wanted
    unload_langs(extra)
    load_langs(wanted)
########## | |
# | |
# PREPROCESS | |
# | |
############# | |
from pullenti.morph.internal.UnicodeInfo import UnicodeInfo | |
# Characters listed in UnicodeInfo.ALL_CHARS; preprocess() keeps only these.
VALID = {info.uni_char for info in UnicodeInfo.ALL_CHARS}


def preprocess(text):
    """Return ``text`` with every character outside ``VALID`` removed."""
    kept = [char for char in text if char in VALID]
    return ''.join(kept)
########### | |
# | |
# PROCESSOR | |
# | |
########### | |
from pullenti.ner.Sdk import Sdk | |
from pullenti.ner.Processor import Processor as RawProcessor | |
from pullenti.ner.SourceOfAnalysis import SourceOfAnalysis | |
from pullenti.ner.ProcessorService import ProcessorService | |
from pullenti.ner.money.MoneyAnalyzer import MoneyAnalyzer | |
from pullenti.ner.date.DateAnalyzer import DateAnalyzer | |
from pullenti.ner.geo.GeoAnalyzer import GeoAnalyzer | |
from pullenti.ner._org.OrganizationAnalyzer import OrganizationAnalyzer | |
from pullenti.ner.person.PersonAnalyzer import PersonAnalyzer | |
# Analyzer names, taken from each analyzer class's ANALYZER_NAME.
PERSON = PersonAnalyzer.ANALYZER_NAME
ORGANIZATION = OrganizationAnalyzer.ANALYZER_NAME
GEO = GeoAnalyzer.ANALYZER_NAME
DATE = DateAnalyzer.ANALYZER_NAME
MONEY = MoneyAnalyzer.ANALYZER_NAME
# Names accepted by Processor; anything else is rejected by assert_one_of().
ANALYZERS = {
    PERSON,
    ORGANIZATION,
    GEO,
    DATE,
    MONEY
}
def select_analyzers(selected):
    """Yield a clone of every registered analyzer whose name is in ``selected``.

    Analyzers come from ``ProcessorService._get_analyzers()``; a clone that
    turns out to be ``None`` is skipped.
    """
    for prototype in ProcessorService._get_analyzers():
        if prototype.name not in selected:
            continue
        clone = prototype.clone()
        if clone is None:  # TODO why would it happen?
            continue
        yield clone
class Processor(Record):
    """Callable wrapper that runs the selected analyzers over a text.

    ``analyzers`` is an iterable of names from ``ANALYZERS``.  Calling the
    instance with a text returns the converted result.
    """

    __attributes__ = ['analyzers']

    def __init__(self, analyzers):
        for name in analyzers:
            assert_one_of(name, ANALYZERS)
        self.analyzers = analyzers

        # Initialize the SDK with the loaded languages, falling back to
        # the defaults when nothing is loaded yet.
        # TODO maybe cache inits
        Sdk.initialize(langs_to_raw(loaded_langs() or DEFAULT_LANGS))

        processor = RawProcessor()
        self.raw = processor
        for analyzer in select_analyzers(self.analyzers):
            processor.add_analyzer(analyzer)

    def __call__(self, text):
        analysis = self.raw.process(SourceOfAnalysis(text))
        return convert_result(analysis)
########### | |
# | |
# REFERENT | |
# | |
############# | |
from pullenti.ner.Referent import Referent as RawReferent | |
from pullenti.ner.person.PersonReferent import PersonReferent as RawPersonReferent | |
from pullenti.ner.person.PersonPropertyReferent import PersonPropertyReferent as RawPersonPropertyReferent | |
from pullenti.ner.person.PersonIdentityReferent import PersonIdentityReferent as RawPersonIdentityReferent | |
from pullenti.ner._org.OrganizationReferent import OrganizationReferent as RawOrganizationReferent | |
from pullenti.ner.geo.GeoReferent import GeoReferent as RawGeoReferent | |
from pullenti.ner.date.DateReferent import DateReferent as RawDateReferent | |
from pullenti.ner.date.DateRangeReferent import DateRangeReferent as RawDateRangeReferent | |
from pullenti.ner.money.MoneyReferent import MoneyReferent as RawMoneyReferent | |
from pullenti.ner.phone.PhoneReferent import PhoneReferent as RawPhoneReferent | |
class Slot(Record):
    """A ``key`` / ``value`` pair attached to a referent."""

    __attributes__ = ['key', 'value']

    def __init__(self, key, value):
        self.key, self.value = key, value
class Referent(Record):
    """Base class of converted referents: a ``label`` plus its ``slots``."""

    __attributes__ = ['label', 'slots']

    # The wrapped raw referent; assigned by convert_referent().
    raw = None

    def __init__(self, label, slots=()):
        self.label, self.slots = label, slots
def slot_property(key):
    """Create a read-only property returning the first slot value for ``key``.

    The property scans ``referent.slots`` in order and evaluates to ``None``
    when no slot has that key.
    """
    def first_value(referent):
        matching = (slot.value for slot in referent.slots if slot.key == key)
        return next(matching, None)

    return property(first_value)
def raw_property(method):
    """Create a read-only property that delegates to the raw referent.

    ``method`` is a property object; on access its getter is called with
    ``referent.raw``.  The getter is looked up at access time, not when the
    property is created.
    """
    def read_from_raw(referent):
        return method.fget(referent.raw)

    return property(read_from_raw)
class PersonReferent(Referent):
    """Converted person referent.

    Most attributes return the first slot value stored under the matching
    ``RawPersonReferent.ATTR_*`` key; ``age`` is read from the raw referent.
    """

    sex = slot_property(RawPersonReferent.ATTR_SEX)
    identity = slot_property(RawPersonReferent.ATTR_IDENTITY)
    # Original, misspelled name; kept as an alias for existing callers.
    indentity = identity
    firstname = slot_property(RawPersonReferent.ATTR_FIRSTNAME)
    middlename = slot_property(RawPersonReferent.ATTR_MIDDLENAME)
    lastname = slot_property(RawPersonReferent.ATTR_LASTNAME)
    nickname = slot_property(RawPersonReferent.ATTR_NICKNAME)
    attribute = slot_property(RawPersonReferent.ATTR_ATTR)
    age = raw_property(RawPersonReferent.age)
    born = slot_property(RawPersonReferent.ATTR_BORN)
    die = slot_property(RawPersonReferent.ATTR_DIE)
    contact = slot_property(RawPersonReferent.ATTR_CONTACT)
    iddoc = slot_property(RawPersonReferent.ATTR_IDDOC)
class PersonPropertyReferent(Referent):
    """Converted person-property referent.

    ``name`` is read from the raw referent; the other attributes return the
    first slot value stored under the matching ``ATTR_*`` key.
    """

    name = raw_property(RawPersonPropertyReferent.name)
    attribute = slot_property(RawPersonPropertyReferent.ATTR_ATTR)
    ref = slot_property(RawPersonPropertyReferent.ATTR_REF)
    # ATTR_HIGHER is a slot key, not a property object, so it must go
    # through slot_property; raw_property would fail on ``.fget`` at access.
    higher = slot_property(RawPersonPropertyReferent.ATTR_HIGHER)
class PersonIdentityReferent(Referent):
    """Converted person-identity referent."""

    # Read through the raw referent's own properties.
    type = raw_property(RawPersonIdentityReferent.typ)
    number = raw_property(RawPersonIdentityReferent.number)
    state = raw_property(RawPersonIdentityReferent.state)
    address = raw_property(RawPersonIdentityReferent.address)

    # First slot value stored under the given key.
    date = slot_property(RawPersonIdentityReferent.ATTR_DATE)
    org = slot_property(RawPersonIdentityReferent.ATTR_ORG)
class OrganizationReferent(Referent):
    """Converted organization referent."""

    # First slot value stored under the given key.
    type = slot_property(RawOrganizationReferent.ATTR_TYPE)
    eponym = slot_property(RawOrganizationReferent.ATTR_EPONYM)
    geo = slot_property(RawOrganizationReferent.ATTR_GEO)
    kladr = slot_property(RawOrganizationReferent.ATTR_KLADR)
    misc = slot_property(RawOrganizationReferent.ATTR_MISC)
    profile = slot_property(RawOrganizationReferent.ATTR_PROFILE)

    # Read through the raw referent's own properties.
    number = raw_property(RawOrganizationReferent.number)
    higher = raw_property(RawOrganizationReferent.higher)
    owner = raw_property(RawOrganizationReferent.owner)
    inn = raw_property(RawOrganizationReferent.inn)
    ogrn = raw_property(RawOrganizationReferent.ogrn)
    names = raw_property(RawOrganizationReferent.names)
    profiles = raw_property(RawOrganizationReferent.profiles)
    types = raw_property(RawOrganizationReferent.types)
    kind = raw_property(RawOrganizationReferent.kind)
class GeoReferent(Referent):
    """Converted geographic referent."""

    # Read through the raw referent's own property.
    types = raw_property(RawGeoReferent.typs)

    # First slot value stored under the given key.
    name = slot_property(RawGeoReferent.ATTR_NAME)
    type = slot_property(RawGeoReferent.ATTR_TYPE)
    alpha2 = slot_property(RawGeoReferent.ATTR_ALPHA2)
    higher = slot_property(RawGeoReferent.ATTR_HIGHER)
    ref = slot_property(RawGeoReferent.ATTR_REF)
    fias = slot_property(RawGeoReferent.ATTR_FIAS)
    bti = slot_property(RawGeoReferent.ATTR_BTI)
class DateReferent(Referent):
    """Converted date referent; every attribute is read from the raw referent."""

    # The raw ``dt`` property, exposed under a more descriptive name.
    as_datetime = raw_property(RawDateReferent.dt)

    century = raw_property(RawDateReferent.century)
    year = raw_property(RawDateReferent.year)
    month = raw_property(RawDateReferent.month)
    day = raw_property(RawDateReferent.day)
    day_of_week = raw_property(RawDateReferent.day_of_week)

    hour = raw_property(RawDateReferent.hour)
    minute = raw_property(RawDateReferent.minute)
    second = raw_property(RawDateReferent.second)

    higher = raw_property(RawDateReferent.higher)
    pointer = raw_property(RawDateReferent.pointer)
class DateRangeReferent(Referent):
    """Converted date-range referent with its two raw boundaries."""

    # Trailing underscore because ``from`` is a Python keyword.
    from_ = raw_property(RawDateRangeReferent.date_from)
    to = raw_property(RawDateRangeReferent.date_to)
class MoneyReferent(Referent):
    """Converted money referent; every attribute is read from the raw referent."""

    currency = raw_property(RawMoneyReferent.currency)

    value = raw_property(RawMoneyReferent.value)
    rest = raw_property(RawMoneyReferent.rest)
    real_value = raw_property(RawMoneyReferent.real_value)

    alt_value = raw_property(RawMoneyReferent.alt_value)
    alt_rest = raw_property(RawMoneyReferent.alt_rest)
class PhoneReferent(Referent):
    """Converted phone referent; every attribute is read from the raw referent."""

    kind = raw_property(RawPhoneReferent.kind)
    country_code = raw_property(RawPhoneReferent.country_code)
    number = raw_property(RawPhoneReferent.number)
    add_number = raw_property(RawPhoneReferent.add_number)
# Maps each supported raw referent class to its converted counterpart.
# convert_referent() looks up the exact type of the raw object here and
# raises TypeError for classes that are not listed.
REFERENTS = {
    RawPersonReferent: PersonReferent,
    RawPersonPropertyReferent: PersonPropertyReferent,
    RawPersonIdentityReferent: PersonIdentityReferent,
    RawOrganizationReferent: OrganizationReferent,
    RawGeoReferent: GeoReferent,
    RawDateReferent: DateReferent,
    RawDateRangeReferent: DateRangeReferent,
    RawMoneyReferent: MoneyReferent,
    RawPhoneReferent: PhoneReferent,
}
def convert_referent(raw):
    """Wrap a raw referent in the matching class from ``REFERENTS``.

    The wrapper is labelled with ``raw.type_name`` and keeps the raw object
    in its ``raw`` attribute; slots are left at their default.

    :raises TypeError: when the exact type of ``raw`` is not in ``REFERENTS``.
    """
    raw_type = type(raw)
    wrapper = REFERENTS.get(raw_type)
    if not wrapper:
        raise TypeError('not supported type: {type}'.format(
            type=raw_type
        ))
    referent = wrapper(raw.type_name)
    referent.raw = raw
    return referent
def convert_slots(raw, referents):
    """Yield a ``Slot`` for each raw slot.

    A value that is a raw referent is replaced by its converted referent,
    looked up in ``referents`` by ``id()``; when that referent is unknown
    the whole slot is skipped.
    """
    for raw_slot in raw:
        value = raw_slot.value
        if isinstance(value, RawReferent):
            try:
                value = referents[id(value)]
            except KeyError:
                # TODO rare
                continue
        yield Slot(raw_slot.type_name, value)
def convert_referents(raws):
    """Convert raw referents and return them keyed by ``id(raw)``.

    Works in two passes: first every raw referent gets a wrapper, then the
    slots are filled in, so a slot can point at any referent of the batch.
    """
    converted = {}
    for raw in raws:
        key = id(raw)
        if key not in converted:
            converted[key] = convert_referent(raw)

    for raw in raws:
        slots = list(convert_slots(raw.slots, converted))
        converted[id(raw)].slots = slots
    return converted
############ | |
# | |
# RESULT | |
# | |
############ | |
from pullenti.ner.ReferentToken import ReferentToken | |
class Span(Record):
    """A ``start`` / ``stop`` pair of character offsets."""

    __attributes__ = ['start', 'stop']

    def __init__(self, start, stop):
        self.start, self.stop = start, stop
class Match(Record):
    """A referent found in the text, its span and its nested matches."""

    __attributes__ = ['referent', 'span', 'children']

    def __init__(self, referent, span, children):
        # Validate all arguments up front, then store them.
        assert_type(referent, Referent)
        assert_type(span, Span)
        for child in children:
            assert_type(child, Match)

        self.referent = referent
        self.span = span
        self.children = children

    def walk(self):
        """Yield this match, then all of its descendants depth-first."""
        yield self
        for child in self.children:
            for descendant in child.walk():
                yield descendant
def get_match(token, referents):
    """Build the ``Match`` for a referent token.

    The span runs from the first inner token's ``begin_char`` to one past
    the last inner token's ``end_char``; children are the matches found
    between those inner tokens.
    """
    first, last = token.begin_token, token.end_token
    return Match(
        referents[id(token.referent)],
        Span(first.begin_char, last.end_char + 1),
        list(get_matches(first, last, referents))
    )
def get_matches(token, stop=None, referents=None):
    """Yield a ``Match`` for every referent token in a token chain.

    Follows ``next0_`` links starting at ``token`` and ends after the token
    equal to ``stop``, or when the chain runs out.
    """
    current = token
    while current:
        if isinstance(current, ReferentToken):
            yield get_match(current, referents)
        if current == stop:
            return
        current = current.next0_
def convert_result(raw):
    """Convert a raw analysis result into a ``Result``.

    Referents are converted from ``raw.entities``, matches are collected
    from the token chain starting at ``raw.first_token``, and the raw object
    is kept on ``result.raw``.
    """
    lookup = convert_referents(raw.entities)
    result = Result(list(get_matches(raw.first_token, referents=lookup)))
    result.raw = raw
    return result
class Result(Record):
    """Top-level matches of one processed text."""

    __attributes__ = ['matches']

    # The raw analysis result; assigned by convert_result().
    raw = None

    def __init__(self, matches):
        self.matches = matches

    def walk(self):
        """Yield every match, nested ones included, depth-first."""
        for top in self.matches:
            for descendant in top.walk():
                yield descendant

    @property
    def graph(self):
        """Graph built from this result by ``graph_result``."""
        return graph_result(self)
############ | |
# | |
# GRAPH | |
# | |
########### | |
from subprocess import Popen, PIPE | |
# Colour values used for graph styling.
BLUE = '#aec7e8'
ORANGE = '#ffbb78'
GREEN = '#dbdb8d'
RED = '#ff9896'
PURPLE = '#f7b6d2'
SILVER = '#eeeeee'
GRAY = 'gray'
DARKGRAY = '#888888'
def dot2svg(source):
    """Render DOT source to SVG text by piping it through ``dot -T svg``.

    :param source: DOT source as text; sent to the process UTF-8 encoded.
    :returns: the process output decoded as UTF-8.
    :raises ValueError: with the captured stderr when ``dot`` exits non-zero.
    """
    command = ['dot', '-T', 'svg']
    process = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    payload = source.encode('utf8')
    output, error = process.communicate(payload)
    if process.returncode == 0:
        return output.decode('utf8')
    raise ValueError(error)
class style(Record):
    """A set of keyword attributes rendered as ``key="value", ...``."""

    __attributes__ = ['attributes']

    def __init__(self, **attributes):
        self.attributes = attributes

    def quote(self, value):
        """Return ``str(value)`` in double quotes with ``"``, LF and CR escaped."""
        text = str(value)
        # The three replacements are independent, so their order is free.
        text = text.replace('"', r'\"')
        text = text.replace('\n', r'\n')
        text = text.replace('\r', r'\r')
        return ''.join(['"', text, '"'])

    def __str__(self):
        pairs = [
            '{key}={value}'.format(key=name, value=self.quote(raw))
            for name, raw in self.attributes.items()
        ]
        return ', '.join(pairs)
class Node(Record):
    """A graph node: the wrapped ``item`` and its optional ``style``."""

    __attributes__ = ['item', 'style']

    def __init__(self, item, style):
        self.item, self.style = item, style

    def __hash__(self):
        # Hash by the identity of the item, not by its value.
        item = self.item
        return id(item)
class Edge(Record):
    """A directed graph edge from ``source`` to ``target`` with optional ``style``."""

    __attributes__ = ['source', 'target', 'style']

    def __init__(self, source, target, style):
        self.source, self.target = source, target
        self.style = style

    def __hash__(self):
        # Hash by the identities of both endpoints.
        source_id, target_id = id(self.source), id(self.target)
        return source_id ^ target_id
class Graph(Record):
    """A set of nodes and edges that can be rendered as DOT source and SVG."""

    __attributes__ = ['nodes', 'edges']

    # Defaults emitted at the top of the DOT source.
    graph_style = style(
        margin=0,
        nodesep=0,
        ranksep=0,
        splines='splines',
        layout='neato',
        overlap='compress',
    )
    node_style = style(
        shape='box',
        height=0,
        width=0,
        fontname='sans',
        fontsize=10,
        color='none',
        style='filled',
        fillcolor=SILVER
    )
    edge_style = style(
        fontname='sans',
        fontsize=8,
        fontcolor=GRAY,
        arrowsize=0.3,
        color=GRAY
    )

    def __init__(self):
        self.nodes = set()
        self.edges = set()
        # Maps id(item) to the sequential index used in the DOT source.
        self.ids = {}

    def add_node(self, item, style=None):
        """Add a node for ``item`` with an optional style."""
        self.nodes.add(Node(item, style))

    def add_edge(self, source, target, style=None):
        """Add an edge from ``source`` to ``target`` with an optional style."""
        self.edges.add(Edge(source, target, style))

    def id(self, item):
        """Return the index of ``item``, assigning the next free one if new."""
        key = id(item)
        if key not in self.ids:
            self.ids[key] = len(self.ids)
        return self.ids[key]

    @property
    def source(self):
        """Generator over the lines of the DOT source."""
        yield 'digraph G {'
        yield 'graph [{graph_style}];'.format(graph_style=str(self.graph_style))
        yield 'node [{node_style}];'.format(node_style=str(self.node_style))
        yield 'edge [{edge_style}];'.format(edge_style=str(self.edge_style))

        for node in self.nodes:
            index = self.id(node.item)
            if node.style:
                yield '{index} [{style}];'.format(
                    index=index,
                    style=str(node.style)
                )
            else:
                yield '{index}'.format(index=index)

        for edge in self.edges:
            source = self.id(edge.source)
            target = self.id(edge.target)
            if edge.style:
                yield '{source} -> {target} [{style}];'.format(
                    source=source,
                    target=target,
                    style=str(edge.style)
                )
            else:
                yield '{source} -> {target};'.format(
                    source=source,
                    target=target
                )

        yield '}'

    def _repr_svg_(self):
        lines = self.source
        return dot2svg('\n'.join(lines))
def graph_result(result):
    """Build a ``Graph`` of referents and their slot values.

    For each slot of each matched referent an edge labelled with the slot
    key is added from the referent to the slot value, together with a node
    for both ends.  Referent nodes are blue and labelled with their label;
    plain values are silver and labelled with the value itself.
    """
    graph = Graph()
    for match in result.walk():
        source = match.referent
        for key, target in source.slots:
            graph.add_edge(source, target, style(label=key))
            graph.add_node(
                source,
                style(label=source.label, fillcolor=BLUE)
            )

            is_referent = isinstance(target, Referent)
            target_style = style(
                label=target.label if is_referent else target,
                fillcolor=BLUE if is_referent else SILVER
            )
            graph.add_node(target, target_style)
    return graph
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment