Skip to content

Instantly share code, notes, and snippets.

@mauro-balades
Forked from marktriggs/Wikipedia.php
Created March 25, 2022 15:31
Show Gist options
  • Save mauro-balades/3768192630801f3b5c19b29389c7907b to your computer and use it in GitHub Desktop.
Save mauro-balades/3768192630801f3b5c19b29389c7907b to your computer and use it in GitHub Desktop.
<?php
/* Not the nicest code we ever wrote... */
require_once 'HTTP/Request.php';
require_once 'Text/Wiki/Mediawiki.php';
/**
 * Looks up an author biography through the English Wikipedia web API
 * (en.wikipedia.org/w/api.php) and renders the article's lead section as
 * XHTML using PEAR's Text_Wiki_Mediawiki.
 *
 * The only public entry point is biographyFor(); every other method is a
 * private helper. HTTP traffic goes through PEAR's HTTP_Request.
 */
class Wikipedia
{
    /**
     * Remove every region of $string enclosed by $start ... $end.
     *
     * Regions may nest: text is kept only while no region is open. The
     * delimiters themselves are always removed. An end marker that has no
     * matching start marker is dropped without affecting the text after it.
     *
     * @param string $string Text to filter.
     * @param string $start  Opening delimiter (e.g. "{{" or "(").
     * @param string $end    Closing delimiter (e.g. "}}" or ")").
     *
     * @return string $string with all delimited regions removed.
     */
    private function zapBetween($string, $start, $end)
    {
        $string = (string) $string;
        $startLen = strlen($start);
        $endLen = strlen($end);

        // An empty delimiter would "match" at every offset without
        // advancing the cursor, so there is nothing sensible to strip.
        if ($startLen === 0 || $endLen === 0) {
            return $string;
        }

        $nesting = 0;
        $result = '';
        $index = 0;
        $len = strlen($string);

        while ($index < $len) {
            // Compare only the bytes at the cursor instead of searching the
            // whole remainder of the string on every step.
            if (substr($string, $index, $startLen) === $start) {
                $nesting++;
                $index += $startLen;
            } elseif (substr($string, $index, $endLen) === $end) {
                // Never go below zero: a stray end marker must not cause all
                // of the following text to be discarded.
                if ($nesting > 0) {
                    $nesting--;
                }
                $index += $endLen;
            } else {
                if ($nesting === 0) {
                    $result .= $string[$index];
                }
                $index++;
            }
        }

        return $result;
    }

    /**
     * Find the "| image = ..." line in raw wikitext and resolve it to an
     * image URL by querying the Wikipedia imageinfo API (150px wide).
     *
     * @param string $wikitext Raw wikitext of the article.
     *
     * @return string|false Image URL, or false when the wikitext names no
     *                      image or the lookup did not yield a URL.
     */
    private function extractImage($wikitext)
    {
        $found = array();
        // Only horizontal whitespace is allowed around the "=" so that an
        // empty "image =" line cannot swallow the following line.
        if (!preg_match(
            "/\n[ \t]*\|[ \t]*image[ \t]*=[ \t]*(.*?)[ \t]*\n/",
            $wikitext,
            $found
        )) {
            return false;
        }

        $image = trim($found[1]);
        if ($image === '') {
            return false;
        }

        // Encode the title so characters such as "&" or "#" in a file name
        // cannot corrupt the query string.
        $imgurl = 'http://en.wikipedia.org/w/api.php'
            . '?prop=imageinfo&action=query&iiprop=url&iiurlwidth=150&format=php'
            . '&titles=Image:' . rawurlencode(str_replace(' ', '_', $image));

        $client = new HTTP_Request();
        $client->setMethod(HTTP_REQUEST_METHOD_GET);
        $client->setURL($imgurl);
        $status = $client->sendRequest();
        if (PEAR::isError($status)) {
            return false;
        }

        $response = $client->getResponseBody();
        if (!$response) {
            return false;
        }

        // SECURITY NOTE(review): unserialize() on data fetched over plain
        // HTTP is unsafe if the response can be tampered with. Consider
        // requesting format=json and using json_decode() instead.
        if (!@unserialize($response)) {
            return false;
        }

        // Hack for wikipedia api: take the first quoted http:// URL that
        // appears in the raw serialized response.
        $matches = array();
        if (!preg_match('/"http:\/\/([^"]*)"/', $response, $matches)) {
            return false;
        }

        return 'http://' . $matches[1];
    }

    /**
     * Strip markup that should not reach the rendered biography.
     *
     * Removes all {{...}} templates and [[Image:...]] lines. Unless the page
     * is a disambiguation page, everything from the first "==" section
     * heading onwards is removed as well, leaving only the lead section.
     *
     * @param string $wikitext Raw wikitext.
     * @param string $pageName Title of the page the wikitext belongs to;
     *                         used only to detect disambiguation pages.
     *                         Defaults to '' (treated as a normal article).
     *
     * @return string Cleaned wikitext.
     */
    private function cleanWikiText($wikitext, $pageName = '')
    {
        $wikitext = $this->zapBetween($wikitext, '{{', '}}');
        $wikitext = preg_replace("/\[\[Image:.*?\]\]\n/", '', $wikitext);

        if (!preg_match('/disambiguation/', (string) $pageName)) {
            $wikitext = preg_replace('/\=\=.*?$/s', '', $wikitext);
        }

        return $wikitext;
    }

    /**
     * Fetch a Wikipedia page and turn it into a biography record.
     *
     * The record is returned only when the page looks relevant: its title
     * contains "disambiguation", or its first sentence mentions
     * author/writer/journalist/novelist, or a four-digit year taken from the
     * last element of $author occurs in the cleaned text.
     *
     * @param string $lookfor URL-encoded page title to request.
     * @param array  $author  Comma-separated parts of the author heading;
     *                        the last element is scanned for a year.
     *
     * @return array|false Array with keys 'name', 'description', 'provider',
     *                     'provider_link' and optionally 'image', or false
     *                     when the page is missing, malformed or irrelevant.
     */
    private function wikipediaBio($lookfor, $author)
    {
        // Plain assignment: "=& new" is a parse error from PHP 7 onwards.
        $wiki = new Text_Wiki_Mediawiki();
        $wiki->setRenderConf('xhtml', 'wikilink', 'pages', false);
        $wiki->setFormatConf('Xhtml', 'charset', 'utf-8');
        $wiki->setFormatConf('Xhtml', 'translate', false);
        $wiki->setRenderConf(
            'xhtml',
            'image',
            'base',
            'http://en.wikipedia.org/wiki/Image:'
        );
        $wiki->setRenderConf(
            'xhtml',
            'wikilink',
            'view_url',
            'http://en.wikipedia.org/wiki/%s'
        );
        $wiki->disableRule('image');

        $url = 'http://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=php&redirects=1&titles=' . $lookfor;
        $requestParams = array('timeout' => 2, 'readTimeout' => array('4', '0'));
        $client = new HTTP_Request('', $requestParams);
        $client->setMethod(HTTP_REQUEST_METHOD_GET);
        $client->setURL($url);

        // Make PEAR hand errors back as return values so they can be tested
        // with PEAR::isError() below.
        PEAR::setErrorHandling(PEAR_ERROR_RETURN);
        $response = $client->sendRequest();
        if (PEAR::isError($response)) {
            return false;
        }

        // SECURITY NOTE(review): unserialize() on data fetched over plain
        // HTTP is unsafe if the response can be tampered with. Consider
        // requesting format=json and using json_decode() instead.
        $body = @unserialize($client->getResponseBody());

        // Bail out on API errors, missing pages ('-1') and unexpected shapes.
        if (!$body
            || isset($body['error'])
            || !isset($body['query']['pages'])
            || !is_array($body['query']['pages'])
            || isset($body['query']['pages']['-1'])
        ) {
            return false;
        }

        $page = array_shift($body['query']['pages']);
        if (!isset($page['title'], $page['revisions'])
            || !is_array($page['revisions'])
        ) {
            return false;
        }

        $revision = array_shift($page['revisions']);
        if (!isset($revision['*'])) {
            return false;
        }

        $result = array();
        $result['name'] = $page['title'];
        $wikitext = $revision['*'];

        // The image must be extracted before templates are stripped.
        $image_url = $this->extractImage($wikitext);
        if ($image_url) {
            $result['image'] = $image_url;
        }

        $wikitext = $this->cleanWikiText($wikitext, $result['name']);
        $result['description'] = $wiki->transform($wikitext, 'xhtml');

        /* If we have life dates, extract them now... */
        $year = false;
        $lastPart = is_array($author) ? end($author) : $author;
        $matches = array();
        if (preg_match('/[0-9]{4}/', (string) $lastPart, $matches)) {
            $year = $matches[0];
        }

        $result['provider'] = 'Wikipedia';
        // NOTE(review): the title is appended unencoded (it may contain
        // spaces); left as-is because callers may already rely on it.
        $result['provider_link'] = 'http://www.wikipedia.org/wiki/' . $result['name'];

        $isDisambiguation = (bool) preg_match('/disambiguation/', $result['name']);
        $mentionsWriting = (bool) preg_match(
            '/^[^\.]+(author|writer|journalist|novelist)[^\.]*\./',
            $wikitext
        );
        $mentionsYear = $year !== false && strpos($wikitext, $year) !== false;

        if ($isDisambiguation || $mentionsWriting || $mentionsYear) {
            return $result;
        }

        return false;
    }

    /**
     * Build the "First Last" page title from the parts of an inverted
     * "Last, First[, dates]" heading.
     *
     * @param array $parts Heading split on commas.
     *
     * @return string Trimmed title; just the single part when there is no
     *                comma, '' when nothing usable remains.
     */
    private function lookupTitle(array $parts)
    {
        $last = isset($parts[0]) ? trim($parts[0]) : '';
        $first = isset($parts[1]) ? trim($parts[1]) : '';

        return trim($first . ' ' . $last);
    }

    /**
     * Locate a biography for an author heading such as "Last, First, 1900-".
     *
     * Gives up when a "(disambiguation)" page exists for the name, because
     * the plain title is then ambiguous.
     *
     * @param string $name Author heading.
     *
     * @return array|false Biography record from wikipediaBio(), or false.
     */
    private function findBiography($name)
    {
        $author = preg_replace('/,$/', '', $name);
        /* Remove any parenthetical remarks from the author's name */
        $author = $this->zapBetween($author, '(', ')');
        $author = explode(',', $author);

        $title = $this->lookupTitle($author);
        if ($title === '') {
            return false;
        }

        // Look for a disambiguation page
        $wikiInfo = $this->wikipediaBio(
            urlencode($title . ' (disambiguation)'),
            $author
        );
        if ($wikiInfo) {
            return false;
        }

        // Look for an author page
        $wikiInfo = $this->wikipediaBio(urlencode($title), $author);
        if ($wikiInfo) {
            return $wikiInfo;
        }

        // Give up!
        return false;
    }

    /**
     * Add the 'snippet' and 'hasmore' keys to a biography record.
     *
     * 'snippet' is the description up to and including the first "</p>";
     * 'hasmore' is true only when non-whitespace content follows it.
     *
     * @param array $result Record containing a 'description' key.
     *
     * @return array The record with 'snippet' and 'hasmore' set.
     */
    private function addSnippet(array $result)
    {
        $description = isset($result['description'])
            ? (string) $result['description']
            : '';

        $snippet = preg_replace("/<\/p>.*/s", '</p>', $description);
        $remainder = (string) substr($description, strlen($snippet));

        $result['snippet'] = $snippet;
        // An empty remainder means there is nothing more to show.
        $result['hasmore'] = preg_match('/\S/', $remainder) === 1;

        return $result;
    }

    /**
     * Return a biography for the given author heading.
     *
     * @param string $name Author heading, e.g. "Last, First, 1900-1980".
     *
     * @return array|false Record with keys 'name', 'description', 'provider',
     *                     'provider_link', 'snippet', 'hasmore' and
     *                     optionally 'image'; false when nothing was found.
     */
    public function biographyFor($name)
    {
        $result = $this->findBiography($name);
        if ($result) {
            $result = $this->addSnippet($result);
        }

        return $result;
    }
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment