Created
October 4, 2024 08:05
-
-
Save Podbrushkin/43053bf16640afce96f01721e2f71d6a to your computer and use it in GitHub Desktop.
Wikidata Reconciliation Powershell
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<#
.SYNOPSIS
Builds one reconciliation query (a hashtable) for a single person record.
.DESCRIPTION
The query always carries the person's fullname as 'query', type 'Q5' and
limit 2. Every non-empty value found in the birthdate/birthdates and
deathdate/deathdates properties adds one entry to 'properties':
  * P569 / P570            - the full value, when -PreciseDate is set and the
                             value is exactly 10 characters long (yyyy-MM-dd);
  * P569@year / P570@year  - the first four characters of the value otherwise.
The 'properties' key is omitted when no date value is present.
.PARAMETER Person
Object with a fullname property and, optionally, birthdate, birthdates,
deathdate and deathdates (scalars or arrays).
.PARAMETER PreciseDate
Send 10-character dates as full dates instead of reducing them to a year.
.OUTPUTS
Hashtable with keys query, type, limit and (optionally) properties.
#>
function Convert-PersonToQuery ([pscustomobject]$Person, [switch]$PreciseDate) {
    # One row per date kind; '@() + a + b' flattens scalars and arrays alike.
    $dateSources = @(
        @{ Property = 'P569'; Raw = @() + $Person.birthdate + $Person.birthdates },
        @{ Property = 'P570'; Raw = @() + $Person.deathdate + $Person.deathdates }
    )

    $properties = @()
    foreach ($source in $dateSources) {
        foreach ($raw in $source.Raw) {
            # Skip missing/empty values ($null, '', 0).
            if (-not $raw) { continue }

            # A [datetime] cast to string yields 'MM/dd/yyyy ...', which would
            # produce a bogus year; format it as an ISO date instead.
            $text = if ($raw -is [datetime]) {
                $raw.ToString('yyyy-MM-dd', [cultureinfo]::InvariantCulture)
            } else {
                ([string]$raw).Trim()
            }
            if ($text.Length -eq 0) { continue }

            if ($PreciseDate -and $text.Length -eq 10) {
                $properties += @{ pid = $source.Property; v = $text }
            } else {
                # Clamp the length so values shorter than four characters
                # (e.g. year 988) do not make Substring throw.
                $yearLength = [math]::Min(4, $text.Length)
                $properties += @{ pid = "$($source.Property)@year"; v = $text.Substring(0, $yearLength) }
            }
        }
    }

    $query = @{
        query = $Person.fullname
        type  = 'Q5'
        limit = 2
    }
    if ($properties.Count -gt 0) {
        $query['properties'] = $properties
    }
    return $query
}
<#
.SYNOPSIS
Sends one batch of people to the Wikidata reconciliation service and attaches
the returned candidates to each object.
.DESCRIPTION
Builds one query per person with Convert-PersonToQuery (keys q0, q1, ...),
posts them in a single request to https://wikidata.reconci.link/ru/api and
adds a 'reconData' note property holding the 'result' of the matching
response entry. Objects that already have reconData, or for which the
response contains no entry, are left untouched. Nothing is written to the
output stream; the input objects are modified in place.
With -WhatIf the request is not sent and the pretty-printed queries are shown.
.PARAMETER People
Objects to reconcile; see Convert-PersonToQuery for the properties read.
.PARAMETER PreciseDate
Passed through to Convert-PersonToQuery.
.EXAMPLE
$krest = [pscustomobject]@{fullname='Крестовский Всеволод'; birthdate=1840}
Add-ReconciliationData -People $krest
$krest.reconData
#>
function Add-ReconciliationData {
    [CmdletBinding(SupportsShouldProcess)]
    param(
        [PSCustomObject[]]$People,
        [switch]$PreciseDate = $false
    )
    Write-Host "count:" $People.count

    # Nothing to reconcile: do not bother the service with an empty request.
    if ($People.Count -eq 0) { return }

    # Ordered so that the -WhatIf preview lists q0, q1, ... in input order.
    $queriesMap = [ordered]@{}
    for ($i = 0; $i -lt $People.Count; $i++) {
        $queriesMap["q$i"] = Convert-PersonToQuery $People[$i] -PreciseDate:$PreciseDate
    }
    $queriesCompressed = $queriesMap | ConvertTo-Json -Depth 99 -Compress

    if (-not $PSCmdlet.ShouldProcess("Reconciliation service", ($queriesMap | ConvertTo-Json -Depth 99))) {
        return
    }

    try {
        $url = 'https://wikidata.reconci.link/ru/api'
        # A hashtable body is sent as URL-encoded form fields. The previous raw
        # "queries=<json>" string was not encoded, so '&', '+', '%' or
        # non-ASCII characters in a name corrupted the request.
        $resp = Invoke-RestMethod -Uri $url -Method Post -Body @{ queries = $queriesCompressed }
    } catch {
        Write-Warning "Add-ReconciliationData: Failed with $($People.count) objects."
        throw
    }
    Write-Information "Add-ReconciliationData: Got data for $($People.count) objects."

    for ($i = 0; $i -lt $People.Count; $i++) {
        $answer = $resp."q$i"
        if ($null -eq $answer) { continue }
        # $null goes on the left: with a collection on the left, -ne filters
        # the collection, so an existing empty reconData array looked "unset".
        if ($null -ne $People[$i].reconData) { continue }
        Add-Member -InputObject $People[$i] `
            -NotePropertyName reconData `
            -NotePropertyValue $answer.result
    }
}
<#
.SYNOPSIS
Reconciles any number of people in batches, isolating objects that make a
request fail.
.DESCRIPTION
Inputs larger than the batch size are cut into consecutive batches. Each batch
is passed to Add-ReconciliationData; if that call throws, the batch is split in
half and both halves are retried, until the failing single objects are found.
Those are appended to $global:FailedReconObjs and reported with a warning.
.PARAMETER People
Objects to reconcile; they are modified in place by Add-ReconciliationData.
.PARAMETER PreciseDate
Passed through to Add-ReconciliationData.
#>
function Add-ReconciliationDataRecursive ([PSCustomObject[]]$People,[switch]$PreciseDate) {
    # 47 seconds for 100 objs with $groupSize = 50
    # 42 seconds for 100 objs with $groupSize = 25 , 42for25, 100sec/300objs/50gr, 61sec/400objs/50gr
    # 206sec/1000objs/50gr, 613sec/4000objs/50gr, 46sec/283objs/100gr
    $groupSize = 25
    $total = $People.Count
    if ($total -eq 0) { return }

    if ($total -gt $groupSize) {
        # Plain index slicing; no script-scope counter is needed or modified.
        for ($start = 0; $start -lt $total; $start += $groupSize) {
            $end = [math]::Min($start + $groupSize, $total) - 1
            Add-ReconciliationDataRecursive $People[$start..$end] -PreciseDate:$PreciseDate
        }
        return
    }

    try {
        Add-ReconciliationData -People $People -PreciseDate:$PreciseDate
    } catch {
        Write-Warning "One of $($People.count) objects provoked error."
        if ($total -lt 2) {
            # A single object that still fails is recorded for later inspection.
            if ($null -eq $global:FailedReconObjs) {
                $global:FailedReconObjs = @()
            }
            $global:FailedReconObjs += $People
            Write-Warning "Found corrupted object: $($People.fullName), `$FailedReconObjs.count=$($global:FailedReconObjs.count)."
            return
        }
        # Floor instead of an [int] cast: the cast rounds half to even, which
        # split e.g. four objects into 3 + 1.
        $middle = [int][math]::Floor(($total - 1) / 2)
        Add-ReconciliationDataRecursive $People[0..$middle] -PreciseDate:$PreciseDate
        Add-ReconciliationDataRecursive $People[($middle + 1)..($total - 1)] -PreciseDate:$PreciseDate
    }
}
# Entry point: reconcile the people listed in the first CSV file of the current
# directory and write the result, with qid/desc columns for confident matches,
# to reconciled.csv.
$outputName = 'reconciled.csv'

# Skip our own output file so that a second run does not use it as input.
$ppl = Get-Item *.csv |
    Where-Object { $_.Name -ne $outputName } |
    Select-Object -First 1 |
    Get-Content -Encoding UTF8 |
    ConvertFrom-Csv

Add-ReconciliationDataRecursive $ppl

# Keep a candidate only when the top result is flagged as a match with score 100.
# @(...)[0] yields $null instead of raising an error for objects that received
# no reconData (e.g. failed objects) or an empty result list.
$ppl | Where-Object {
    $best = @($_.reconData)[0]
    $best -and $best.score -eq 100 -and $best.match -eq $true
} | ForEach-Object {
    $best = @($_.reconData)[0]
    Add-Member -InputObject $_ -NotePropertyName qid -NotePropertyValue $best.id
    Add-Member -InputObject $_ -NotePropertyName desc -NotePropertyValue $best.description
}

# Union of all property names, so qid/desc columns appear even though only
# some objects carry them; the raw reconData is left out of the export.
$propNames = $ppl | ForEach-Object { $_.PSObject.Properties.Name } | Select-Object -Unique
$ppl | Select-Object -Property $propNames -ExcludeProperty reconData | ConvertTo-Csv -UseQuotes AsNeeded > $outputName
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment