Skip to content

Instantly share code, notes, and snippets.

@Podbrushkin
Created October 4, 2024 08:05
Show Gist options
  • Save Podbrushkin/43053bf16640afce96f01721e2f71d6a to your computer and use it in GitHub Desktop.
Save Podbrushkin/43053bf16640afce96f01721e2f71d6a to your computer and use it in GitHub Desktop.
Wikidata Reconciliation PowerShell
<#
.SYNOPSIS
Builds one reconciliation query (a hashtable) for a single person record.
.DESCRIPTION
Reads fullname, birthdate/birthdates and deathdate/deathdates from $Person and
returns a hashtable with the keys query (the full name), type ('Q5'), limit (2)
and, only when at least one date is present, properties: an array of
@{pid; v} hashtables. Birth dates are emitted under P569, death dates under P570,
birth dates first.
.PARAMETER Person
Object carrying fullname and, optionally, birthdate/birthdates/deathdate/deathdates.
Each date field may hold a single value or several; empty values are ignored.
.PARAMETER PreciseDate
When set, a date whose string form is exactly 10 characters long is sent unchanged
under the plain property id. Every other date (and every date when the switch is
not set) is reduced to its leading four characters and sent under '<pid>@year'.
.OUTPUTS
System.Collections.Hashtable
#>
function Convert-PersonToQuery ([pscustomobject]$Person, [switch]$PreciseDate = $false) {
    # Birth and death dates are handled identically; only the property id differs.
    $dateFields = @(
        @{ PropertyId = 'P569'; Raw = @() + $Person.birthdate + $Person.birthdates },
        @{ PropertyId = 'P570'; Raw = @() + $Person.deathdate + $Person.deathdates }
    )

    $properties = @()
    foreach ($field in $dateFields) {
        # Drop empty values, normalise to trimmed strings, then drop what became empty.
        $dates = $field.Raw |
            Where-Object { $_ } |
            ForEach-Object { ([string]$_).Trim() } |
            Where-Object { $_ }

        foreach ($date in $dates) {
            if ($PreciseDate -and $date.Length -eq 10) {
                $properties += @{ pid = $field.PropertyId; v = $date }
            } else {
                # Never ask Substring for more characters than the value has;
                # a short value such as '840' is sent as-is instead of failing.
                $yearLength = [Math]::Min(4, $date.Length)
                $properties += @{ pid = "$($field.PropertyId)@year"; v = $date.Substring(0, $yearLength) }
            }
        }
    }

    $query = @{
        query = $Person.fullname
        type  = 'Q5'
        limit = 2
    }
    # The key is omitted entirely when there are no date constraints.
    if ($properties.Count -gt 0) {
        $query['properties'] = $properties
    }
    return $query
}
<#
.SYNOPSIS
Sends one batch of people to the reconciliation service and attaches the results.
.DESCRIPTION
Builds one query per person with Convert-PersonToQuery, keys the queries q0..q(N-1),
and POSTs them as the 'queries' form field to https://wikidata.reconci.link/ru/api.
For every person that does not have a reconData property yet and for which the
response contains an entry, that entry's 'result' is stored in a new reconData
note property. The input objects are modified in place; nothing is written to the
pipeline. Supports -WhatIf/-Confirm; the pretty-printed queries are shown as the
action description.
.PARAMETER People
The person objects to reconcile in a single request.
.PARAMETER PreciseDate
Passed through to Convert-PersonToQuery.
.NOTES
Any error raised by the web request is re-thrown after a warning is written.
.EXAMPLE
$krest = [pscustomobject]@{fullname='Крестовский Всеволод'; birthdate=1840}
Add-ReconciliationData -People $krest
$krest.reconData
#>
function Add-ReconciliationData {
    [CmdletBinding(SupportsShouldProcess)]
    param(
        [PSCustomObject[]]$People,
        [switch]$PreciseDate = $false
    )
    Write-Host "count:" $People.count
    # Nothing to ask the service about; avoid a pointless request.
    if ($People.Count -eq 0) { return }

    $queriesMap = @{}
    for ($i = 0; $i -lt $People.Count; $i++) {
        $queriesMap["q$i"] = Convert-PersonToQuery $People[$i] -PreciseDate:$PreciseDate
    }
    $queriesCompressed = $queriesMap | ConvertTo-Json -Depth 99 -Compress

    if ($PSCmdlet.ShouldProcess("Reconciliation service", ($queriesMap | ConvertTo-Json -Depth 99))) {
        try {
            $url = 'https://wikidata.reconci.link/ru/api'
            # A dictionary body is sent as URL-encoded form fields. A hand-built
            # "queries=<json>" string is sent verbatim, so '&', '+' or '%' inside a
            # name would corrupt the form field.
            $resp = Invoke-RestMethod $url -Method Post -Body @{ queries = $queriesCompressed }
        } catch {
            Write-Warning "Add-ReconciliationData: Failed with $($People.count) objects."
            throw
        }
        Write-Information "Add-ReconciliationData: Got data for $($People.count) objects."

        for ($i = 0; $i -lt $People.Count; $i++) {
            $person = $People[$i]
            $answer = $resp."q$i"
            # Test for the property itself: comparing an (empty) result array with
            # '-ne $null' filters the array instead of yielding a boolean.
            $alreadyReconciled = $null -ne $person.PSObject.Properties['reconData']
            if ($alreadyReconciled -or $null -eq $answer) { continue }
            Add-Member -InputObject $person `
                -NotePropertyName reconData `
                -NotePropertyValue $answer.result
        }
    }
}
<#
.SYNOPSIS
Reconciles any number of people in batches and isolates records that make a request fail.
.DESCRIPTION
Inputs larger than GroupSize are cut into consecutive batches of at most GroupSize
people, and each batch is processed by a recursive call. A batch that fits is passed
to Add-ReconciliationData. If that call throws, the batch is split in two halves and
each half is retried, until the failure is narrowed down to a single object. Such an
object is appended to $global:FailedReconObjs (created on first use) and a warning
is written.
.PARAMETER People
The person objects to reconcile; they are modified in place by Add-ReconciliationData.
.PARAMETER PreciseDate
Passed through to Add-ReconciliationData.
.PARAMETER GroupSize
Maximum number of people per request. Defaults to 25.
.NOTES
Every error from Add-ReconciliationData is treated the same way, so an object can
also end up in $global:FailedReconObjs because of a failure unrelated to its data.
#>
function Add-ReconciliationDataRecursive ([PSCustomObject[]]$People, [switch]$PreciseDate, [int]$GroupSize = 25) {
    # Timings measured by the original author:
    # 47 seconds for 100 objs with $groupSize = 50
    # 42 seconds for 100 objs with $groupSize = 25 , 42for25, 100sec/300objs/50gr, 61sec/400objs/50gr
    # 206sec/1000objs/50gr, 613sec/4000objs/50gr, 46sec/283objs/100gr
    if ($GroupSize -lt 1) {
        throw "GroupSize must be at least 1, got $GroupSize."
    }

    if ($People.Count -gt $GroupSize) {
        # Slice into consecutive batches; no shared counter state is needed.
        for ($start = 0; $start -lt $People.Count; $start += $GroupSize) {
            $end = [Math]::Min($start + $GroupSize, $People.Count) - 1
            Add-ReconciliationDataRecursive $People[$start..$end] -PreciseDate:$PreciseDate -GroupSize $GroupSize
        }
        return
    }

    try {
        Add-ReconciliationData -People $People -PreciseDate:$PreciseDate
    } catch {
        Write-Warning "One of $($People.count) objects provoked error."
        if ($People.Count -lt 2) {
            # Narrowed down to a single object (or nothing): record it and stop.
            if ($null -eq $global:FailedReconObjs) {
                $global:FailedReconObjs = @()
            }
            $global:FailedReconObjs += $People
            Write-Warning "Found corrupted object: $($People.fullName), `$FailedReconObjs.count=$($global:FailedReconObjs.count)."
            return
        }
        # Bisect: with at least two elements both halves are non-empty.
        $middle = [int][Math]::Floor(($People.Count - 1) / 2)
        Add-ReconciliationDataRecursive $People[0..$middle] -PreciseDate:$PreciseDate -GroupSize $GroupSize
        Add-ReconciliationDataRecursive $People[($middle + 1)..($People.Count - 1)] -PreciseDate:$PreciseDate -GroupSize $GroupSize
    }
}
# Driver: reconcile the first *.csv file of the current directory and write the
# rows, with qid/desc added for perfect matches, to reconciled.csv.
$csvFile = Get-Item *.csv | Select-Object -First 1
if ($null -eq $csvFile) {
    Write-Warning "No *.csv file found in the current directory; nothing to reconcile."
} else {
    $ppl = @($csvFile | Get-Content -Encoding UTF8 | ConvertFrom-Csv)
    if ($ppl.Count -eq 0) {
        Write-Warning "$($csvFile.Name) contains no rows; nothing to reconcile."
    } else {
        Add-ReconciliationDataRecursive $ppl

        # Only the top candidate counts, and only when it is a full-score match.
        # @(...)[0] yields $null for rows without reconData (e.g. records that
        # failed) instead of raising "Cannot index into a null array".
        $ppl | Where-Object {
            $best = @($_.reconData)[0]
            $null -ne $best -and $best.score -eq 100 -and $best.match -eq $true
        } | ForEach-Object {
            Add-Member -InputObject $_ -NotePropertyName qid -NotePropertyValue $_.reconData[0].id
            Add-Member -InputObject $_ -NotePropertyName desc -NotePropertyValue $_.reconData[0].description
        }

        # Only matched rows carry qid/desc, so collect the union of property names
        # to give every row the same set of columns.
        $propNames = $ppl | ForEach-Object { $_.psobject.Properties.name } | Select-Object -Unique
        $ppl | Select-Object $propNames -ExcludeProperty reconData | ConvertTo-Csv -UseQuotes AsNeeded > reconciled.csv
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment