Created
September 21, 2010 23:47
-
-
Save marktriggs/590822 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Not the nicest code we ever wrote... */ | |
<?php | |
require_once 'HTTP/Request.php'; | |
require_once 'Text/Wiki/Mediawiki.php'; | |
/**
 * Fetches a short author biography from the English Wikipedia.
 *
 * Relies on the PEAR packages loaded at the top of this file:
 * HTTP_Request (API calls) and Text_Wiki_Mediawiki (wikitext -> XHTML).
 *
 * The only public entry point is biographyFor(); it returns false when no
 * page can be confidently attributed to the requested author.
 */
class Wikipedia
{
    /**
     * Remove every region enclosed by $start ... $end, honouring nesting.
     *
     * An $end delimiter that has no matching $start is kept as literal text
     * (previously it pushed the nesting counter below zero, which silently
     * discarded everything that followed it).
     *
     * @param string $string Text to filter
     * @param string $start  Opening delimiter, e.g. "{{"
     * @param string $end    Closing delimiter, e.g. "}}"
     *
     * @return string $string without the delimited regions
     */
    private function zapBetween($string, $start, $end)
    {
        $startLen = strlen($start);
        $endLen = strlen($end);

        // An empty delimiter would match at every offset without consuming
        // any input, so there is nothing sensible to remove.
        if ($startLen === 0 || $endLen === 0) {
            return $string;
        }

        $nesting = 0;
        $result = '';
        $index = 0;
        $len = strlen($string);

        while ($index < $len) {
            // substr_compare() only inspects the bytes at $index, whereas
            // strpos() would scan the whole remainder at every position.
            if (substr_compare($string, $start, $index, $startLen) === 0) {
                $nesting++;
                $index += $startLen;
            } elseif ($nesting > 0
                && substr_compare($string, $end, $index, $endLen) === 0
            ) {
                $nesting--;
                $index += $endLen;
            } else {
                if ($nesting === 0) {
                    $result .= $string[$index];
                }
                $index++;
            }
        }

        return $result;
    }

    /**
     * Find the infobox "image" entry in raw wikitext and resolve it to a
     * 150px-wide thumbnail URL through the Wikipedia imageinfo API.
     *
     * @param string $wikitext Raw page wikitext (infobox still present)
     *
     * @return string|false Thumbnail URL, or false when none could be found
     */
    private function extractImage($wikitext)
    {
        // Tolerate any amount of padding around "|", "image" and "=".  Only
        // spaces/tabs are skipped after "=" so that an empty value does not
        // capture the following infobox line.
        if (!preg_match("/\n\s*\|\s*image\s*=[ \t]*(.*?)\n/", $wikitext, $found)) {
            return false;
        }
        $image = trim($found[1]);
        if (!$image) {
            return false;
        }

        // The file name comes from page content, so it must be URL-encoded.
        $imgurl = 'http://en.wikipedia.org/w/api.php'
            . '?prop=imageinfo&action=query&iiprop=url&iiurlwidth=150&format=php'
            . '&titles=Image:' . urlencode(str_replace(' ', '_', $image));

        $client = new HTTP_Request();
        $client->setMethod(HTTP_REQUEST_METHOD_GET);
        $client->setURL($imgurl);
        $response = $client->sendRequest();
        if (PEAR::isError($response)) {
            return false;
        }

        $body = $client->getResponseBody();
        // NOTE(review): unserialize() on a remote response is only used here
        // as a validity check; switching the API call to format=json and
        // json_decode() would avoid deserialising untrusted data.
        if (!$body || !@unserialize($body)) {
            return false;
        }

        // Hack for wikipedia api: take the first quoted URL in the serialized
        // payload instead of walking the decoded structure.
        if (!preg_match('/"(https?:\/\/[^"]*)"/', $body, $matches)) {
            return false;
        }

        return $matches[1];
    }

    /**
     * Strip templates, image links and (for ordinary pages) everything from
     * the first section heading onwards, leaving the lead text.
     *
     * @param string $wikitext Raw page wikitext
     * @param string $title    Page title; when it contains "disambiguation"
     *                         the section bodies are kept.  Optional, so
     *                         existing one-argument calls behave as before.
     *
     * @return string Cleaned wikitext
     */
    private function cleanWikiText($wikitext, $title = '')
    {
        $wikitext = $this->zapBetween($wikitext, '{{', '}}');
        $wikitext = preg_replace("/\[\[Image:.*?\]\]\n/", '', $wikitext);

        // The original tested an undefined local ($result['name']), so this
        // guard could never recognise a disambiguation page.
        if (!preg_match('/disambiguation/', (string) $title)) {
            // Drop everything from the first "==" heading to the end.
            $wikitext = preg_replace('/\=\=.*?$/s', '', $wikitext);
        }

        return $wikitext;
    }

    /**
     * Fetch a Wikipedia page and decide whether it plausibly describes the
     * requested author.
     *
     * A page is accepted when its title contains "disambiguation", when its
     * first sentence contains author/writer/journalist/novelist, or when a
     * four-digit year taken from the last element of $author appears in the
     * cleaned text.
     *
     * @param string $lookfor URL-encoded page title
     * @param array  $author  Comma-separated parts of the catalogue name
     *
     * @return array|false Keys: name, description, provider, provider_link
     *                     and optionally image; false when rejected or on
     *                     any request/parse failure
     */
    private function wikipediaBio($lookfor, $author)
    {
        // "=& new" is a parse error from PHP 7 on; objects are handles anyway.
        $wiki = new Text_Wiki_Mediawiki();
        $wiki->setRenderConf('xhtml', 'wikilink', 'pages', false);
        $wiki->setFormatConf('Xhtml', 'charset', 'utf-8');
        $wiki->setFormatConf('Xhtml', 'translate', false);
        $wiki->setRenderConf(
            'xhtml', 'image', 'base', 'http://en.wikipedia.org/wiki/Image:'
        );
        $wiki->setRenderConf(
            'xhtml', 'wikilink', 'view_url', 'http://en.wikipedia.org/wiki/%s'
        );
        $wiki->disableRule('image');

        $url = 'http://en.wikipedia.org/w/api.php?action=query&prop=revisions'
            . '&rvprop=content&format=php&redirects=1&titles=' . $lookfor;

        $requestParams = array('timeout' => 2, 'readTimeout' => array('4', '0'));
        $client = new HTTP_Request('', $requestParams);
        $client->setMethod(HTTP_REQUEST_METHOD_GET);
        $client->setURL($url);

        // Make PEAR hand errors back as return values.
        PEAR::setErrorHandling(PEAR_ERROR_RETURN);
        $response = $client->sendRequest();
        if (PEAR::isError($response)) {
            return false;
        }

        // NOTE(review): unserialize() of remote data -- see extractImage().
        $body = @unserialize($client->getResponseBody());

        // Check if data exists or not ("-1" is the API's missing-page key).
        if (!$body
            || isset($body['error'])
            || !isset($body['query']['pages'])
            || !is_array($body['query']['pages'])
            || isset($body['query']['pages']['-1'])
        ) {
            return false;
        }

        $page = array_shift($body['query']['pages']);
        if (!isset($page['title'], $page['revisions'])
            || !is_array($page['revisions'])
        ) {
            return false;
        }
        $revision = array_shift($page['revisions']);
        if (!isset($revision['*'])) {
            return false;
        }

        $result = array();
        $result['name'] = $page['title'];
        $wikitext = $revision['*'];

        // The infobox lives inside {{...}}, so look for the image before
        // the templates are stripped.
        $image_url = $this->extractImage($wikitext);
        if ($image_url) {
            $result['image'] = $image_url;
        }

        $wikitext = $this->cleanWikiText($wikitext, $result['name']);
        $result['description'] = $wiki->transform($wikitext, 'xhtml');

        /* If we have life dates, extract them now... */
        $year = false;
        $lastPart = (is_array($author) && $author) ? (string) end($author) : '';
        if (preg_match('/[0-9]{4}/', $lastPart, $matches)) {
            $year = $matches[0];
        }

        $result['provider'] = "Wikipedia";
        $result['provider_link'] = "http://www.wikipedia.org/wiki/" . $result['name'];

        $looksRight = preg_match('/disambiguation/', $result['name'])
            || preg_match(
                '/^[^\.]+(author|writer|journalist|novelist)[^\.]*\./',
                $wikitext
            )
            || ($year && preg_match("/$year/", $wikitext));

        return $looksRight ? $result : false;
    }

    /**
     * Turn a catalogue-style name ("Surname, Forenames, dates") into a
     * Wikipedia lookup.
     *
     * If a "<name> (disambiguation)" page exists the name is treated as
     * ambiguous and no biography is returned.
     *
     * @param string $name Catalogue-style author name
     *
     * @return array|false Result of wikipediaBio(), or false
     */
    private function findBiography($name)
    {
        $author = preg_replace('/,$/', '', $name);

        /* Remove any parenthetical remarks from the author's name */
        $author = $this->zapBetween($author, '(', ')');
        $author = explode(',', $author);

        // Names without a comma have no forename part; trimming avoids
        // stray leading/double spaces in the page title.
        $surname = trim($author[0]);
        $forenames = isset($author[1]) ? trim($author[1]) : '';
        $fullName = trim("$forenames $surname");
        if ($fullName === '') {
            return false;
        }

        // Look for a disambiguation page
        $wikiInfo = $this->wikipediaBio(
            urlencode("$fullName (disambiguation)"), $author
        );
        if ($wikiInfo) {
            return false;
        }

        // Look for an author page
        $wikiInfo = $this->wikipediaBio(urlencode($fullName), $author);
        if ($wikiInfo) {
            return $wikiInfo;
        }

        // Give up!
        return false;
    }

    /**
     * Add the "snippet" (description up to and including the first </p>)
     * and "hasmore" (true when non-whitespace content follows the snippet)
     * keys to a biography result.
     *
     * @param array $result Biography array containing 'description'
     *
     * @return array $result with 'snippet' and 'hasmore' added
     */
    private function addSnippet($result)
    {
        $description = isset($result['description'])
            ? (string) $result['description']
            : '';

        $snippet = preg_replace('/<\/p>.*/s', '</p>', $description);
        $remainder = (string) substr($description, strlen($snippet));

        $result['snippet'] = $snippet;
        // "\s*" rather than "\s+": an empty remainder also means "no more".
        $result['hasmore'] = !preg_match('/^\s*$/', $remainder);

        return $result;
    }

    /**
     * Look up a biography for a catalogue-style author name.
     *
     * @param string $name e.g. "Knuth, Donald Ervin, 1938-"
     *
     * @return array|false Biography (see wikipediaBio()) plus 'snippet' and
     *                     'hasmore', or false when nothing suitable is found
     */
    public function biographyFor($name)
    {
        $result = $this->findBiography($name);
        if ($result) {
            $result = $this->addSnippet($result);
        }
        return $result;
    }
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Using it...
This works because Miles Franklin has the word "writer" in the first sentence.
Works because we match on "1938"
php> = $wikipedia->biographyFor ("Knuth, Donald Ervin, 1938-");
Array
(
[name] => Donald Knuth
[image] => http://upload.wikimedia.org/wikipedia/commons/thumb/4/4f/KnuthAtOpenContentAlliance.jpg/150px-KnuthAtOpenContentAlliance.jpg
[description] =>
Donald Ervin Knuth ("Frequently Asked Questions" at Stanford site. Gives the pronunciation of his name as "Ka-NOOTH".) (born January 10, 1938) is a computer scientist and Professor Emeritus of the Art of Computer Programming at Stanford University.Donald Knuth's Homepage at Stanford.
Works because of the word "author" in the first sentence.
php> = $wikipedia->biographyFor ("Thiele, Colin");
Array
(
[name] => Colin Thiele
[description] =>
Colin Milton Thiele, AC (16 November 1920 – 5 September 2006) was an Australian author and educator. He was renowned for his award-winning children's fiction, most notably the novels Storm Boy and Blue Fin.
Gives no match--no life dates or author-ish words...
php> = $wikipedia->biographyFor ("Hitchcock, Alfred");
php>