-
-
Save mauro-balades/3768192630801f3b5c19b29389c7907b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Not the nicest code we ever wrote... */ | |
<?php | |
require_once 'HTTP/Request.php'; | |
require_once 'Text/Wiki/Mediawiki.php'; | |
/**
 * Looks up an author biography on the English Wikipedia.
 *
 * The article wikitext is fetched through the MediaWiki API (format=php),
 * stripped of templates / image links / sections, and rendered to XHTML with
 * PEAR Text_Wiki_Mediawiki.  HTTP access goes through PEAR HTTP_Request.
 *
 * The only public entry point is biographyFor().
 *
 * NOTE(review): the API endpoints below are plain http:// URLs -- confirm
 * that they are still answered directly rather than redirected to HTTPS.
 */
class Wikipedia
{
    /**
     * Remove every (possibly nested) region delimited by $start ... $end.
     *
     * The delimiters themselves are removed as well.  An end delimiter that
     * has no matching start delimiter is dropped but does not affect the text
     * that follows it.
     *
     * @param string $string Text to filter.
     * @param string $start  Opening delimiter, e.g. "{{" or "(".
     * @param string $end    Closing delimiter, e.g. "}}" or ")".
     *
     * @return string The text found outside of all delimited regions.
     */
    private function zapBetween($string, $start, $end)
    {
        $string = (string) $string;
        $start = (string) $start;
        $end = (string) $end;
        $startLen = strlen($start);
        $endLen = strlen($end);

        // An empty delimiter would "match" at every offset without ever
        // advancing the cursor, so there is nothing sensible to remove.
        if ($startLen === 0 || $endLen === 0) {
            return $string;
        }

        $nesting = 0;
        $result = '';
        $index = 0;
        $len = strlen($string);

        while ($index < $len) {
            // Compare only the bytes at the cursor instead of searching the
            // whole remainder of the string on every iteration.
            if (substr($string, $index, $startLen) === $start) {
                $nesting++;
                $index += $startLen;
            } elseif (substr($string, $index, $endLen) === $end) {
                // Never let the depth go negative: a stray end delimiter
                // must not swallow the rest of the text.
                if ($nesting > 0) {
                    $nesting--;
                }
                $index += $endLen;
            } else {
                if ($nesting === 0) {
                    $result .= $string[$index];
                }
                $index++;
            }
        }

        return $result;
    }

    /**
     * Perform an HTTP GET request and return the response body.
     *
     * @param string $url Fully built request URL.
     *
     * @return string|false The response body, or false when the request
     *                      failed or the body was empty.
     */
    private function fetchUrl($url)
    {
        $requestParams = array('timeout' => 2, 'readTimeout' => array('4', '0'));
        $client = new HTTP_Request('', $requestParams);
        $client->setMethod(HTTP_REQUEST_METHOD_GET);
        $client->setURL($url);

        // Make PEAR hand errors back as return values so they can be tested
        // with PEAR::isError() below.
        PEAR::setErrorHandling(PEAR_ERROR_RETURN);
        $response = $client->sendRequest();
        if (PEAR::isError($response)) {
            return false;
        }

        $body = $client->getResponseBody();

        return $body ? $body : false;
    }

    /**
     * Find the infobox image named in the wikitext and resolve it to a URL.
     *
     * Looks for a "| image = <name>" line in the wikitext and asks the
     * MediaWiki imageinfo API for a 150px-wide rendition of that image.
     *
     * @param string $wikitext Raw article wikitext.
     *
     * @return string|false The image URL, or false when no image is named or
     *                      no URL could be determined.
     */
    private function extractImage($wikitext)
    {
        // Horizontal whitespace only ([ \t]), so an empty "image =" line can
        // never run on into the following infobox line.
        $found = array();
        if (!preg_match('/\n[ \t]*\|[ \t]*image[ \t]*=[ \t]*(.*?)\n/', $wikitext, $found)) {
            return false;
        }

        $image = trim($found[1]);
        if (!$image) {
            return false;
        }

        $imgurl = 'http://en.wikipedia.org/w/api.php'
            . '?prop=imageinfo&action=query&iiprop=url&iiurlwidth=150&format=php'
            . '&titles=' . urlencode('Image:' . str_replace(' ', '_', $image));

        $response = $this->fetchUrl($imgurl);
        if ($response === false) {
            return false;
        }

        // SECURITY NOTE(review): unserialize() is applied to data received
        // over the network.  It is used here only as a validity check; consider
        // switching the API call to format=json and json_decode().
        if (!@unserialize($response)) {
            return false;
        }

        // Hack for wikipedia api: take the first quoted URL that appears in
        // the serialized response instead of walking the decoded structure.
        $matches = array();
        if (preg_match('/"(https?:\/\/[^"]*)"/', $response, $matches)) {
            return $matches[1];
        }

        return false;
    }

    /**
     * Strip wikitext down to the introductory prose.
     *
     * Removes {{template}} blocks and image links.  Unless the page title
     * marks a disambiguation page, everything from the first "==" heading
     * onwards is removed as well.
     *
     * @param string $wikitext Raw article wikitext.
     * @param string $title    Title of the page the wikitext belongs to; with
     *                         the default empty title the sections are always
     *                         stripped.
     *
     * @return string The cleaned wikitext.
     */
    private function cleanWikiText($wikitext, $title = '')
    {
        $wikitext = $this->zapBetween($wikitext, '{{', '}}');
        $wikitext = preg_replace('/\[\[(?:Image|File):.*?\]\]\n/', '', $wikitext);

        if (!preg_match('/disambiguation/', (string) $title)) {
            $wikitext = preg_replace('/\=\=.*?$/s', '', $wikitext);
        }

        return $wikitext;
    }

    /**
     * Fetch one Wikipedia page and turn it into a biography record.
     *
     * The page is accepted when its title contains "disambiguation", when
     * its first sentence mentions author/writer/journalist/novelist, or when
     * a four-digit year taken from the last element of $author occurs in the
     * cleaned text.
     *
     * @param string $lookfor URL-encoded page title to request.
     * @param array  $author  Comma-separated parts of the author heading; the
     *                        last element is searched for a four-digit year.
     *
     * @return array|false Array with the keys 'name', 'description',
     *                     'provider', 'provider_link' and optionally 'image',
     *                     or false when the page is missing, cannot be
     *                     retrieved, or is not accepted.
     */
    private function wikipediaBio($lookfor, $author)
    {
        $url = 'http://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=php&redirects=1&titles=' . $lookfor;

        $raw = $this->fetchUrl($url);
        if ($raw === false) {
            return false;
        }

        // SECURITY NOTE(review): unserialize() on a remote response; consider
        // format=json and json_decode() instead.  '@' hides the notice PHP
        // raises for a malformed payload.
        $body = @unserialize($raw);

        // Check if data exists or not
        if (!$body
            || isset($body['error'])
            || !isset($body['query']['pages'])
            || !is_array($body['query']['pages'])
            || isset($body['query']['pages']['-1'])
        ) {
            return false;
        }

        $pages = $body['query']['pages'];
        $page = reset($pages);
        if (!isset($page['title']) || empty($page['revisions']) || !is_array($page['revisions'])) {
            return false;
        }

        $revisions = $page['revisions'];
        $revision = reset($revisions);
        if (!isset($revision['*'])) {
            return false;
        }

        $title = $page['title'];
        $rawWikitext = $revision['*'];
        $wikitext = $this->cleanWikiText($rawWikitext, $title);

        /* If we have life dates, extract them now... */
        $year = false;
        $matches = array();
        $lastPart = is_array($author) ? end($author) : $author;
        if (preg_match('/[0-9]{4}/', (string) $lastPart, $matches)) {
            $year = $matches[0];
        }

        $accepted = preg_match('/disambiguation/', $title)
            || preg_match('/^[^\.]+(author|writer|journalist|novelist)[^\.]*\./', $wikitext)
            || ($year !== false && preg_match("/$year/", $wikitext));
        if (!$accepted) {
            return false;
        }

        // Only pay for the image lookup (a second HTTP request) and the wiki
        // rendering once the page is known to be wanted.
        $result = array();
        $result['name'] = $title;

        $image_url = $this->extractImage($rawWikitext);
        if ($image_url) {
            $result['image'] = $image_url;
        }

        $wiki = new Text_Wiki_Mediawiki();
        $wiki->setRenderConf('xhtml', 'wikilink', 'pages', false);
        $wiki->setFormatConf('Xhtml', 'charset', 'utf-8');
        $wiki->setFormatConf('Xhtml', 'translate', false);
        $wiki->setRenderConf('xhtml', 'image', 'base',
            'http://en.wikipedia.org/wiki/Image:');
        $wiki->setRenderConf('xhtml', 'wikilink', 'view_url',
            'http://en.wikipedia.org/wiki/%s');
        $wiki->disableRule('image');

        $result['description'] = $wiki->transform($wikitext, 'xhtml');
        $result['provider'] = 'Wikipedia';
        $result['provider_link'] = 'http://www.wikipedia.org/wiki/' . $result['name'];

        return $result;
    }

    /**
     * Locate the Wikipedia biography for an author heading.
     *
     * The heading is expected in "Last, First[, dates]" form.  Parenthetical
     * remarks are removed first.  When a "<First Last> (disambiguation)" page
     * exists the name is considered ambiguous and no biography is returned.
     *
     * @param string $name Author heading.
     *
     * @return array|false Biography record as produced by wikipediaBio(), or
     *                     false when nothing suitable was found.
     */
    private function findBiography($name)
    {
        $author = preg_replace('/,$/', '', $name);

        /* Remove any parenthetical remarks from the author's name */
        $author = $this->zapBetween($author, '(', ')');
        $author = explode(',', $author);

        // A heading without a comma has no second element.
        $last = trim($author[0]);
        $first = isset($author[1]) ? trim($author[1]) : '';
        $fullName = trim("$first $last");
        if ($fullName === '') {
            return false;
        }

        // Look for a disambiguation page
        $wikiInfo = $this->wikipediaBio(urlencode("$fullName (disambiguation)"), $author);
        if ($wikiInfo) {
            return false;
        }

        // Look for an author page
        $wikiInfo = $this->wikipediaBio(urlencode($fullName), $author);
        if ($wikiInfo) {
            return $wikiInfo;
        }

        // Give up!
        return false;
    }

    /**
     * Return the Wikipedia biography for an author heading.
     *
     * @param string $name Author heading, e.g. "Twain, Mark, 1835-1910".
     *
     * @return array|false The record from findBiography() extended with
     *                     'snippet' (the description up to and including its
     *                     first closing </p>) and 'hasmore' (true when
     *                     non-whitespace content follows the snippet); false
     *                     when no biography was found.
     */
    public function biographyFor($name)
    {
        $result = $this->findBiography($name);
        if ($result) {
            $result['snippet'] = preg_replace("/<\/p>.*/s", '</p>',
                $result['description']);

            // substr() yields false on older PHP versions when nothing is
            // left, hence the cast.  An empty remainder means "no more".
            $remainder = (string) substr($result['description'],
                strlen((string) $result['snippet']));
            $result['hasmore'] = !preg_match('/^\s*$/', $remainder);
        }

        return $result;
    }
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment