// Gist tomayac/242a3f48ca84c6e21ce3, created June 20, 2015 17:44.
// Note: GitHub flags this file as containing bidirectional Unicode text that
// may be interpreted or compiled differently than it appears. To review it,
// open the file in an editor that reveals hidden Unicode characters.
'use strict';

// Third-party dependencies.
var async = require('async');
var request = require('request');
// NOTE: the identifier is misspelled ("Expontential"), but it is kept as-is
// because the rest of this module refers to it under exactly this name.
var ExpontentialSmoothingStream = require('exponential-smoothing-stream');
var numbers = require('numbers');
var geolib = require('geolib');
// Project-local helpers (this module uses util.createGoogleMapsUrl).
var util = require('./util.js');
// MediaWiki API endpoints.
//
// REDIRECTS_URL is a complete URL template: replace {{LANGUAGE}} with the
// Wikipedia language code and append the article title.
var REDIRECTS_URL = 'http://{{LANGUAGE}}.wikipedia.org/w/api.php?action=query' +
    '&blnamespace=0&list=backlinks&blfilterredir=redirects&bllimit=max&' +
    'format=json&bltitle=';

// The remaining endpoints are URL *suffixes*: callers prepend
// 'http://' + <language code> and append the article title.
var LANGUAGE_LINKS_URL = '.wikipedia.org/w/api.php?action=query&' +
    'prop=langlinks&format=json&lllimit=max&titles=';
var GEO_COORDINATES_URL = '.wikipedia.org/w/api.php?action=query&' +
    'prop=coordinates&format=json&colimit=max&coprop&coprimary=primary&titles=';
// {{rvstart}} must be replaced with the timestamp to start listing from.
var REVISIONS_URL = '.wikipedia.org/w/api.php?action=query&format=json' +
    '&rvstart={{rvstart}}&prop=revisions&rvprop=timestamp|user&rvlimit=max' +
    '&rvdir=newer&titles=';

// Identifies this client to the Wikipedia API.
// NOTE(review): the contact address looks like it was redacted by the page
// this code was copied from -- confirm and restore the real address.
var USER_AGENT =
    'Disaster Monitor * Contact: Thomas Steiner ([email protected])';
var HEADERS = { 'User-Agent': USER_AGENT };

// Maximum number of per-language API requests in flight at the same time.
var PARALLEL_LIMIT = 5;
// Timeout applied to every Wikipedia API request, in milliseconds.
var REQUEST_TIMEOUT_MS = 5000;
// Revisions are requested starting this many milliseconds in the past.
var ONE_DAY_MS = 24 * 60 * 60 * 1000;

/**
 * Issues a GET request and parses the response body as JSON.
 *
 * @param {string} url Fully built request URL.
 * @param {function(*, ?Object)} callback Called with (transportError, data).
 *     transportError is whatever `request` reported and is only set when the
 *     request itself failed. data is the parsed JSON object, or null when the
 *     response was unusable (status other than 200, or a body that is not a
 *     JSON object).
 */
function fetchJson(url, callback) {
  var options = {
    url: url,
    headers: HEADERS,
    timeout: REQUEST_TIMEOUT_MS
  };
  request.get(options, function(err, response, body) {
    if (err) {
      return callback(err, null);
    }
    if (!response || response.statusCode !== 200) {
      return callback(null, null);
    }
    var data = null;
    try {
      data = JSON.parse(body);
    } catch (e) {
      // A malformed body is reported to the caller as "no usable data"
      // instead of throwing inside the request callback.
      data = null;
    }
    return callback(null, (data && typeof data === 'object') ? data : null);
  });
}

/**
 * Returns the first entry of a `query.pages` map, or null if it is empty.
 *
 * @param {Object} pages The `query.pages` object of an API response.
 * @return {?Object} The first page object, or null.
 */
function firstPage(pages) {
  var pageIds = Object.keys(pages);
  return pageIds.length ? pages[pageIds[0]] : null;
}

/**
 * Percent-decodes a title, falling back to the raw title when it is not
 * valid percent-encoding (e.g., a title containing a literal "%").
 *
 * @param {string} title The title as returned by the API.
 * @return {string} The decoded title, or the unchanged input.
 */
function decodeTitle(title) {
  try {
    return decodeURIComponent(title);
  } catch (e) {
    return title;
  }
}

/**
 * Fetches the language links of an article and appends the article itself.
 *
 * @param {string} language Wikipedia language code of the article.
 * @param {string} article Article title.
 * @param {function(*, Array=)} callback Called with (err, langLinks). err is
 *     the transport error, 'Internal Server Error' for an unusable response,
 *     or 'File Not Found' when the page has no language links. Each langLinks
 *     entry has the shape {lang: <code>, '*': <title>}.
 */
function fetchLanguageLinks(language, article, callback) {
  var url = 'http://' + language + LANGUAGE_LINKS_URL +
      encodeURIComponent(article);
  fetchJson(url, function(err, data) {
    if (err || !data || !data.query || !data.query.pages) {
      return callback(err || 'Internal Server Error');
    }
    var page = firstPage(data.query.pages);
    if (!page || !Array.isArray(page.langlinks)) {
      return callback('File Not Found');
    }
    return callback(null, page.langlinks.concat([{
      lang: language,
      '*': article
    }]));
  });
}

/**
 * Runs `worker` once per language link with bounded parallelism.
 *
 * @param {Array} langLinks Entries of the shape {lang: <code>, '*': <title>}.
 * @param {function(Object, string, function)} worker Called with
 *     (langLink, title, done), where title is '<lang>:<decoded title>'.
 * @param {function(*, Object=)} callback Called with (err, results); results
 *     maps each title to the value its worker passed to `done`.
 */
function forEachLanguageLink(langLinks, worker, callback) {
  var tasks = {};
  langLinks.forEach(function(langLink) {
    var title = langLink.lang + ':' + decodeTitle(langLink['*']);
    tasks[title] = function(done) {
      worker(langLink, title, done);
    };
  });
  async.parallelLimit(tasks, PARALLEL_LIMIT, callback);
}

/**
 * Feeds the gaps between consecutive revisions through the smoothing stream
 * and derives the spiking flag from the values the stream emits.
 *
 * @param {Array} revisions Revisions sorted newest first; each entry has a
 *     numeric `timestamp` in milliseconds.
 * @param {function(*, Object)} callback Called with
 *     (null, {revisions, intervals, spiking}).
 */
function summarizeRevisions(revisions, callback) {
  var intervals = [];
  var ess = new ExpontentialSmoothingStream({ smoothingFactor: 0.5 });
  ess.on('data', function(value) {
    intervals.push(value);
  });
  ess.on('end', function() {
    var standardDeviation = numbers.statistic.standardDev(intervals);
    // Spiking: at least five intervals, and the last emitted interval is
    // below half the standard deviation of all emitted intervals.
    var spiking = (intervals.length >= 5) &&
        (intervals[intervals.length - 1] < standardDeviation / 2);
    return callback(null, {
      revisions: revisions,
      intervals: intervals,
      spiking: spiking
    });
  });
  revisions.forEach(function(revision, i) {
    if (i > 0) {
      ess.write(revisions[i - 1].timestamp - revision.timestamp);
    }
  });
  ess.end();
}

var wikipedia = {
  /**
   * Collects the revisions made during the last 24 hours to an article and
   * to all of its other-language versions.
   *
   * @param {string} language Wikipedia language code of the article.
   * @param {string} article Article title.
   * @param {function(*, Object=)} callback Called with (err, result). err is
   *     the transport error of the language links request,
   *     'Internal Server Error', or 'File Not Found'. result is
   *     {revisions, intervals, spiking}; revisions are sorted newest first
   *     and each has {user, timestamp, date, article}.
   */
  getRevisions: function(language, article, callback) {
    console.log('Getting revisions of ' + language + ':' + article + '.');
    fetchLanguageLinks(language, article, function(err, langLinks) {
      if (err) {
        return callback(err);
      }
      var yesterday = new Date(Date.now() - ONE_DAY_MS).toISOString();
      var revisionsUrl = REVISIONS_URL.replace(/\{\{rvstart\}\}/, yesterday);
      forEachLanguageLink(langLinks, function(langLink, title, done) {
        var url = 'http://' + langLink.lang + revisionsUrl +
            encodeURIComponent(langLink['*']);
        fetchJson(url, function(err, data) {
          if (err || !data) {
            return done(err || 'Error fetching revisions of ' + title);
          }
          var page = (data.query && data.query.pages) ?
              firstPage(data.query.pages) : null;
          if (!page || !Array.isArray(page.revisions)) {
            return done(null, []);
          }
          return done(null, page.revisions.map(function(revision) {
            return {
              user: revision.user,
              timestamp: new Date(revision.timestamp).getTime(),
              date: revision.timestamp,
              article: title
            };
          }));
        });
      }, function(err, results) {
        if (err) {
          return callback('Internal Server Error');
        }
        var revisions = [];
        Object.keys(results).forEach(function(title) {
          revisions = revisions.concat(results[title]);
        });
        revisions.sort(function(a, b) {
          return b.timestamp - a.timestamp;
        });
        return summarizeRevisions(revisions, callback);
      });
    });
  },

  /**
   * Collects the primary geo coordinates of an article and of all of its
   * other-language versions, de-duplicates them, and computes their center.
   *
   * @param {string} language Wikipedia language code of the article.
   * @param {string} article Article title.
   * @param {function(*, Object=)} callback Called with (err, result). err is
   *     'Internal Server Error' or 'File Not Found'. result is
   *     {individualCoordinates: [{lat, lon, map}], averageCoordinates};
   *     averageCoordinates is {lat, lon, map}, or {} when nothing was found.
   */
  getGeolocation: function(language, article, callback) {
    console.log('Geo-referencing ' + language + ':' + article + '.');
    fetchLanguageLinks(language, article, function(err, langLinks) {
      if (err) {
        // Unlike getRevisions, this method never exposed the raw transport
        // error to its caller, only one of the two error strings.
        return callback(err === 'File Not Found' ?
            err : 'Internal Server Error');
      }
      forEachLanguageLink(langLinks, function(langLink, title, done) {
        // The title must be URL-encoded; titles containing '&', '#', '+', or
        // non-ASCII characters would otherwise corrupt the query string.
        var url = 'http://' + langLink.lang + GEO_COORDINATES_URL +
            encodeURIComponent(langLink['*']);
        fetchJson(url, function(err, data) {
          if (err || !data) {
            return done(err || 'Error fetching coordinates of ' + title);
          }
          var page = (data.query && data.query.pages) ?
              firstPage(data.query.pages) : null;
          if (!page || !Array.isArray(page.coordinates)) {
            return done(null, []);
          }
          return done(null, page.coordinates.map(function(geo) {
            return {
              lat: geo.lat,
              lon: geo.lon
            };
          }));
        });
      }, function(err, results) {
        if (err) {
          return callback('Internal Server Error');
        }
        // De-duplicate across all language versions, keyed by 'lat|lon'.
        var unique = {};
        Object.keys(results).forEach(function(title) {
          results[title].forEach(function(geo) {
            unique[geo.lat + '|' + geo.lon] = true;
          });
        });
        var individualCoordinates = Object.keys(unique).map(function(key) {
          var parts = key.split('|');
          return {
            lat: parseFloat(parts[0]),
            lon: parseFloat(parts[1]),
            map: util.createGoogleMapsUrl(parts[0], parts[1])
          };
        });
        var averageCoordinates = {};
        if (individualCoordinates.length) {
          var center = geolib.getCenter(
              individualCoordinates.map(function(coordinate) {
                return {
                  latitude: coordinate.lat,
                  longitude: coordinate.lon
                };
              }));
          averageCoordinates.lat = center.latitude;
          averageCoordinates.lon = center.longitude;
          averageCoordinates.map = util.createGoogleMapsUrl(
              averageCoordinates.lat, averageCoordinates.lon);
        }
        return callback(null, {
          individualCoordinates: individualCoordinates,
          averageCoordinates: averageCoordinates
        });
      });
    });
  },

  /**
   * Lists the titles that redirect to an article.
   *
   * @param {string} language Wikipedia language code of the article.
   * @param {string} article Article title.
   * @param {function(*, Array=)} callback Called with (err, titles). err is
   *     'Internal Server Error'. titles starts with `article` itself,
   *     followed by the title of every redirect the API returned.
   */
  getRedirects: function(language, article, callback) {
    console.log('Getting redirects for ' + language + ':' + article + '.');
    // Whitespace becomes '_' as before; the result is then URL-encoded so
    // that reserved characters in the title cannot alter the query string.
    var url = REDIRECTS_URL.replace(/\{\{LANGUAGE\}\}/, language) +
        encodeURIComponent(article.replace(/\s/g, '_'));
    fetchJson(url, function(err, data) {
      if (err || !data) {
        return callback('Internal Server Error');
      }
      var results = [article];
      var backlinks = data.query && data.query.backlinks;
      if (Array.isArray(backlinks)) {
        backlinks.forEach(function(backlink) {
          results.push(backlink.title);
        });
      }
      return callback(null, results);
    });
  }
};
// Public API of this module: getRevisions, getGeolocation, getRedirects.
module.exports = wikipedia;
// (GitHub page footer: sign up or sign in on GitHub to comment on this gist.)