Import parts of CBDB (the China Biographical Database) into Wikidata.
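A minimal usage sketch, assuming the script below is saved as import_cbdb.py (the filename is not part of the gist): the two command-line arguments give the lower (inclusive) and upper (exclusive) bound of the CBDB c_personid range to import, and the CBDB SQLite dump 20170424CBDBauUserSqlite.db is expected in the working directory.

python import_cbdb.py 33236 366589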
#%%
# Fetch all existing P497 (CBDB ID) statements from Wikidata, so persons
# that already have an item are updated instead of duplicated.
from datetime import date
from sqlalchemy import create_engine
import pywikibot
from pprint import pprint
from operator import is_not
from functools import partial
from concurrent.futures import ThreadPoolExecutor
from SPARQLWrapper import SPARQLWrapper, JSON
import sys

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery('''PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX bd: <http://www.bigdata.com/rdf#>
SELECT ?item ?value WHERE {
  ?item wdt:P497 ?value
}''')
sparql.setReturnFormat(JSON)
json = sparql.query().convert()
# Map CBDB ID (zero-padded string) -> Wikidata entity URI.
cdbd_ids = dict(map(lambda x: (x['value']['value'], x['item']['value']), json['results']['bindings']))
pprint(len(cdbd_ids))
# CBDB status codes -> Wikidata items used as targets of occupation (P106).
STATUS_CODE_TO_PROPERTY = {
    '2': 'Q11545923',
    '3': 'Q1294787',
    '4': 'Q11063',
    '7': 'Q854979',
    '8': 'Q854997',
    '9': 'Q3303330',
    '26': 'Q39018',
    '27': 'Q7723211',
    '29': 'Q179294',
    '31': 'Q45352519',
    '33': 'Q131512',
    '35': 'Q220098',
    '36': 'Q844586',
    '37': 'Q15954519',
    '43': 'Q903422',
    '48': 'Q1097498',
    '56': 'Q1062083',
    '57': 'Q170790',
    '59': 'Q215536',
    '60': 'Q1688932',
    '65': 'Q16744001',
    '71': 'Q1028181',
    '72': 'Q39631',
    '74': 'Q13219330',
    '75': 'Q1125062',
    '76': 'Q2303143',
    '86': 'Q12773225',
    '89': 'Q48282',
    '94': 'Q37226',
    '114': 'Q49757',
    '142': 'Q201788',
    '143': 'Q1126160',
    '165': 'Q36180',
    '166': 'Q1437754',
    '180': 'Q20826540',
    '182': 'Q14467526',
    '184': 'Q3243461',
    '235': 'Q45353837',
    '236': 'Q45353897'
}
# LIMIT = (33236, 366589)  # 366589
# Range of CBDB c_personid values to process, taken from the command line.
LIMIT = (int(sys.argv[1]), int(sys.argv[2]))
eng = create_engine('sqlite:///20170424CBDBauUserSqlite.db')
conn = eng.connect()
biog = conn.execute(f'''SELECT * FROM BIOG_MAIN WHERE c_personid >= {LIMIT[0]} AND c_personid < {LIMIT[1]} ORDER BY c_personid ASC;''')
s = pywikibot.Site('wikidata', 'wikidata')
executor = ThreadPoolExecutor(max_workers=2)
def create_item(row, eng):
    """Create or update the Wikidata item for one BIOG_MAIN row."""
    conn = eng.connect()
    c_personid = row['c_personid']
    print(c_personid)
    eng_name = row['c_name']
    cht_name = row['c_name_chn']
    # NOTE: this check is a no-op as written; it presumably was meant to
    # filter out entries whose Chinese name contains '(', ')', '氏' or '妻'.
    if len(set('()氏妻') & set(cht_name)) == 0:
        pass
    # Collect romanised and Chinese alternative names as aliases.
    aliases_eng = [row['c_mingzi_rm'], row['c_name_rm']]
    aliases_cht = []
    for alt_row in conn.execute(f'''SELECT * FROM ALTNAME_DATA WHERE c_personid={c_personid};'''):
        aliases_eng += [alt_row['c_alt_name']]
        aliases_cht += [alt_row['c_alt_name_chn']]
    aliases_eng = list(filter(partial(is_not, None), aliases_eng))
    aliases_cht = list(filter(partial(is_not, None), aliases_cht))
    # Fall back to an alias when the main name is missing.
    if eng_name is None and len(aliases_eng) > 0:
        eng_name = aliases_eng.pop()
    if cht_name is None and len(aliases_cht) > 0:
        cht_name = aliases_cht.pop()
    # CBDB IDs are stored on Wikidata as zero-padded 7-character strings.
    rjust_personid = str(c_personid).rjust(7, '0')
    if cdbd_ids.get(rjust_personid) is None:
        # No item with this CBDB ID yet: create a new one.
        item = pywikibot.ItemPage(s)
        data = {'aliases': {},
                'labels': {},
                'claims': {}}
        label_eng = None
        label_cht = None
        ali_eng = None
        ali_cht = None
    else:
        # Item already exists: load its current labels, aliases and claims.
        item = pywikibot.ItemPage.from_entity_uri(s, cdbd_ids[rjust_personid])
        data = item.get()
        data = {
            'aliases': data['aliases'],
            'labels': data['labels'],
            'claims': data['claims']
        }
        label_eng = item.labels.get('en')
        label_cht = item.labels.get('zh-hant') or item.labels.get('zh') or item.labels.get('zh-hans')
        ali_eng = item.aliases.get('en')
        ali_cht = item.aliases.get('zh-hant') or item.aliases.get('zh') or item.aliases.get('zh-hans')
    # Use the CBDB name as the label where none exists, otherwise keep it as an alias.
    if eng_name is not None:
        if label_eng is None:
            data['labels']['en'] = eng_name
        else:
            aliases_eng.insert(0, eng_name)
    if cht_name is not None:
        if label_cht is None:
            data['labels']['zh-hant'] = cht_name
        else:
            aliases_cht.insert(0, cht_name)
    # Merge new aliases with any existing ones, dropping empty language slots.
    if data['aliases'].get('en') is None:
        data['aliases']['en'] = []
    if data['aliases'].get('zh-hant') is None:
        data['aliases']['zh-hant'] = []
    data['aliases']['en'] = list(set(data['aliases']['en']) | set(aliases_eng))
    data['aliases']['zh-hant'] = list(set(data['aliases']['zh-hant']) | set(aliases_cht))
    if len(data['aliases']['en']) == 0:
        del data['aliases']['en']
    if len(data['aliases']['zh-hant']) == 0:
        del data['aliases']['zh-hant']
    pprint(data)
    claim_cbdb = data['claims'].get('P497')
    if claim_cbdb is not None:
        # The item already carries a CBDB ID claim: only update labels/aliases.
        del data['claims']
        item.editEntity(data, summary="Imported from CBDB [[Wikidata:Data_Import_Hub#CBDB]]")
    else:
        item.editEntity(data, summary="Imported from CBDB [[Wikidata:Data_Import_Hub#CBDB]]")
        # Add CBDB ID (P497) and instance of (P31) human (Q5) claims.
        claim = pywikibot.Claim(s, 'P497')
        claim.setTarget(rjust_personid)
        item.addClaim(claim)
        human_claim = pywikibot.Claim(s, 'P31')
        human_claim.setTarget(pywikibot.ItemPage(s, 'Q5'))
        item.addClaim(human_claim)
    # Turn CBDB status codes into occupation (P106) claims.
    for status_row in conn.execute(f'''SELECT * FROM STATUS_CODES c, STATUS_DATA d WHERE c.c_status_code = d.c_status_code AND d.c_status_code>0 AND d.c_personid={c_personid};'''):
        key = str(int(status_row['c_status_code']))
        if STATUS_CODE_TO_PROPERTY.get(key) is None:
            continue
        occu_claim = pywikibot.Claim(s, 'P106')
        occu_claim.setTarget(pywikibot.ItemPage(s, STATUS_CODE_TO_PROPERTY[key]))
        item.addClaim(occu_claim)
# Submit each person row to the thread pool. Note that calling
# future.result() immediately after submit() processes the rows serially.
for row in biog:
    future = executor.submit(create_item, row, eng)
    print(future.result())
#%%
# Scratch cell: inspect an existing P497 (CBDB ID) claim and build a test claim.
import pywikibot
from pprint import pprint
s = pywikibot.Site('wikidata', 'wikidata')
pprint(pywikibot.ItemPage(s, 'Q720').get()['claims']['P497'][0])
# Claim.fromJSON(DataSite("wikidata", "wikidata"), {'mainsnak': {'snaktype': 'value', 'property': 'P497', 'datatype': 'external-id', 'datavalue': {'value': '0029239', 'type': 'string'}}, 'type': 'statement', 'id': 'Q720$4C93CED5-F403-45DD-B270-AA9D6AD76AB7', 'rank': 'normal'})
claim_p = pywikibot.Claim(s, 'P497')
claim_p.setTarget('1111111')
pprint(claim_p)
#%%
# Scratch cell: fetch an item from test.wikidata.org.
import pywikibot
from pprint import pprint
s = pywikibot.Site('test', 'wikidata')
pprint(pywikibot.ItemPage(s, 'Q110490').get())
#%%
# Scratch cell: load an item from its entity URI.
import pywikibot
from pprint import pprint
s = pywikibot.Site('wikidata', 'wikidata')
pprint(pywikibot.ItemPage.from_entity_uri(s, 'http://www.wikidata.org/entity/Q11109043').get())
#%%
# Scratch cell: create a throwaway item on test.wikidata.org.
import pywikibot
from pprint import pprint
s = pywikibot.Site('test', 'wikidata')
p = pywikibot.ItemPage(s)
p.editEntity({'labels': {'en': 'test2'}})
pprint(p.get())
#%%
# Scratch cell: inspect the P497 property and build a test claim.
import pywikibot
from pprint import pprint
s = pywikibot.Site('wikidata', 'wikidata')
pprint(pywikibot.PropertyPage(s, 'P497').get())
claim_p = pywikibot.Claim(s, 'P497')
claim_p.setTarget('1111111')
pprint(claim_p)
#%%
# Scratch cell: poke at the low-level API request object.
a = pywikibot.data.api.Request()
print(a)
#%%
# Fetch the CBDB ID -> entity URI mapping again and cache it with pickle.
from SPARQLWrapper import SPARQLWrapper, JSON
import pickle
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery('''PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX bd: <http://www.bigdata.com/rdf#>
SELECT ?item ?value WHERE {
  ?item wdt:P497 ?value
}''')
sparql.setReturnFormat(JSON)
json = sparql.query().convert()
cdbd_ids = dict(map(lambda x: (x['value']['value'], x['item']['value']), json['results']['bindings']))
with open('dump_cdbd_mapping', 'wb') as f:
    pickle.dump(cdbd_ids, f)
#%%
# Reload the cached mapping from disk.
import pickle
with open('dump_cdbd_mapping', 'rb') as f:
    cdbd_ids = pickle.load(f)
print(cdbd_ids)