Node URL Crawler
// Client file
//***** hdCrawl ******//
// dependencies
// * lodash or underscore
// * jQuery
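// The preview functions below also call String.prototype.trunc(), which is not part of this gist.
// A minimal sketch of such a helper, assuming trunc(n, useWordBoundary) truncates the string to n
// characters (optionally at a word boundary) and appends an ellipsis:
String.prototype.trunc = String.prototype.trunc || function(n, useWordBoundary) {
    if (this.length <= n)
        return this.toString();
    var cut = this.substr(0, n);
    if (useWordBoundary && cut.lastIndexOf(' ') > 0)
        cut = cut.substr(0, cut.lastIndexOf(' '));
    return cut + '…';
};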
// Providers
var hdCrawlProviders = {
    youtube: {
        regexp: /(?:.+?)?(?:\/v\/|watch\/|\?v=|\&v=|youtu\.be\/|\/v=|^youtu\.be\/|y2u\.be\/|^y2u\.be\/)([a-zA-Z0-9_-]{11})+/i,
        dimensions: { width: 640, height: 390 },
        authKey: 'your_google_authentication_key_goes_here',
        match: function(url) {
            return this.regexp.test(url);
        },
        preview: function(url, input, cb) {
            var self = this;
            if(input.match(self.regexp)) {
                $.getJSON('https://www.googleapis.com/youtube/v3/videos?id=' + RegExp.$1 + '&key=' + self.authKey + '&part=snippet,statistics').success(function (d) {
                    var video = {};
                    var ytData = d.items[0];
                    video.host = 'youtube.com';
                    video.title = ytData.snippet.title;
                    video.imageCount = 1;
                    video.images = [ytData.snippet.thumbnails.medium.url];
                    video.description = (ytData.snippet.description.trunc(250, true)).replace(/\n/g, ' ').replace(/\u00a0/g, ' ');
                    video.rawDescription = ytData.snippet.description;
                    video.views = ytData.statistics.viewCount;
                    video.likes = ytData.statistics.likeCount;
                    video.url = 'https://www.youtube.com/watch?v=' + RegExp.$1;
                    video.width = self.dimensions.width;
                    video.height = self.dimensions.height;
                    video.id = ytData.id;
                    video.source = 'youtube';
                    cb(input, video);
                });
            } else {
                cb(input, {});
            }
        }
    },
    vimeo: {
        regexp: /https?:\/\/(?:www\.)?vimeo.com\/(?:channels\/(?:\w+\/)?|groups\/([^\/]*)\/videos\/|album\/(\d+)\/video\/|)(\d+)(?:$|\/|\?)*/i,
        dimensions: { width: 640, height: 390 },
        match: function(url) {
            return this.regexp.test(url);
        },
        preview: function(url, input, cb) {
            var self = this;
            if(input.match(self.regexp)) {
                $.getJSON('https://vimeo.com/api/v2/video/' + RegExp.$3 + '.json').success(function (d) {
                    var video = {};
                    video.host = 'vimeo.com';
                    video.title = d[0].title;
                    video.rawDescription = (d[0].description).replace(/\n/g, '<br/>').replace(/\u00a0/g, '<br/>');
                    video.description = (d[0].description).replace(/((&lt;|<)br\s*\/*(&gt;|>)\r\n)/g, ' ').trunc(250, true);
                    video.imageCount = 1;
                    video.images = [d[0].thumbnail_medium];
                    video.views = d[0].stats_number_of_plays;
                    video.likes = d[0].stats_number_of_likes;
                    video.url = d[0].url;
                    video.width = self.dimensions.width;
                    video.height = self.dimensions.height;
                    video.id = d[0].id;
                    video.source = 'vimeo';
                    cb(input, video);
                });
            }
        }
    },
    url: {
        regexp: /((href|src)=["']|)(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/i,
        dimensions: { width: 640, height: 390 },
        match: function(url) {
            return this.regexp.test(url);
        },
        preview: function(url, input, cb) {
            var self = this;
            // HDV.crawlUrl is an internal helper that calls the Node crawling function.
            // I use HAPI to create my routes to reach the Node code, but you can use anything you want
            // (a minimal sketch of such a helper follows right after this providers object).
            HDV.crawlUrl(url, function(urlData) {
                urlData.width = self.dimensions.width;
                urlData.height = self.dimensions.height;
                cb(input, urlData);
            });
        }
    }
};
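// The url provider above delegates to HDV.crawlUrl, which is not defined in this gist: it is a
// thin client-side bridge to the Node crawler in the second file, exposed through a server route
// (HAPI in my case). A minimal sketch, assuming a hypothetical GET /api/crawl?url=... route that
// calls the Node module's parse() and returns its JSON result:
var HDV = window.HDV || {};
HDV.crawlUrl = HDV.crawlUrl || function(url, cb) {
    $.getJSON('/api/crawl', { url: url }).success(function(urlData) {
        cb(urlData);
    });
};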
// Crawler object to be used
// Things to improve:
// * crawl more than one URL
// * define the provider order
// * return a matched url --> provider list, so we can perform more than one crawl
function hdCrawl() {
    var self = this;
    self.providers = [];
}
// in:
// url --> calls each provider's match function to see if the url matches (generally a regexp, but it could be anything you need to check the match)
// we follow the hdCrawlProviders order. The first provider that matches is returned
// out:
// the matching provider, or null if no provider was found
//
hdCrawl.prototype.getProvider = function(url) {
    var self = this;
    self.providers = self.providers.length > 0 ? self.providers : _.keys(hdCrawlProviders);
    for (var i = 0; i < self.providers.length; i++) {
        var provider = hdCrawlProviders[self.providers[i]];
        if(provider.match(url))
            return provider;
    }
    return null;
};
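// A quick illustration of getProvider(): it walks hdCrawlProviders in order and returns the
// first provider whose match() accepts the url (the URLs below are examples only):
// var crawler = new hdCrawl();
// crawler.getProvider('https://www.youtube.com/watch?v=dQw4w9WgXcQ'); // --> the youtube provider
// crawler.getProvider('https://vimeo.com/76979871');                  // --> the vimeo provider
// crawler.getProvider('https://github.com/');                         // --> the generic url provider
// crawler.getProvider('no url in here');                              // --> null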
// in:
// * input --> the text to crawl
// out (via callback):
// * input, if modified by the provider
// * the crawledInfo, shaped like:
// var retData = {
//     host: internals.checkEmpty(urlParsed.host, ''), // the calling host. ex. http://google.com --> google.com
//     title: internals.checkEmpty(title, ''), // the crawled title
//     description: internals.elipses(internals.checkEmpty(description, ''), internals.descriptionSizeLimite), // the description limited to descriptionSizeLimite characters
//     rawDescription: internals.checkEmpty(description, ''), // the crawled description
//     imageCount: aImages.length, // image count
//     images: aImages, // the array of image urls collected
//     views: 0, // the number of views the video has
//     likes: 0, // the number of likes the video has
//     url: uri, // the actual incoming url
//     width: 0, // base width of the video
//     height: 0, // base height of the video
//     id: 0, // video id
//     source: 'url' // just to reference that we crawled a URL. In the future we might add more types of crawling like youtube, vimeo, twitter...
// };
hdCrawl.prototype.crawl = function(input, cb) {
    var self = this;
    // first check if we have any URLs to process
    var urlRegex = /((href|src)=["']|)(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;
    var urls = input.match(urlRegex);
    if(urls == null || urls.length === 0) {
        cb(input, {});
    } else {
        // get and call the provider
        var url = urls[0];
        var provider = self.getProvider(url);
        if(provider != null && url !== '') {
            provider.preview(url, input, function(input, crawledInfo) {
                cb(input, crawledInfo);
            });
        } else {
            cb(input, {});
        }
    }
};
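// Example usage on the client (a minimal sketch; hdCrawlExample is just an illustrative name and
// how you render crawledInfo is up to you):
function hdCrawlExample(text) {
    var crawler = new hdCrawl();
    crawler.crawl(text, function(input, crawledInfo) {
        if (_.isEmpty(crawledInfo))
            return; // no provider recognized a URL in the text
        // crawledInfo follows the retData shape documented above (title, description, images, ...)
        console.log(crawledInfo.source, crawledInfo.title, crawledInfo.images);
    });
}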
// Node file
// Dependencies
var _ = require('lodash');
var Request = require('request');
var Cheerio = require('cheerio');
var URL = require('url');
// **********************************************************************
// Things to improve:
// * Separate text crawling and image crawling for better performance
// * Create crawling providers, like: url, vimeo, youtube, etc.
// * Add caching: in memory, redis, mongodb, memcached.
//   The cache would map url --> parsed url data (depending on your storage architecture, the best
//   storages would be key/value stores like Redis, memcached, etc.). A minimal in-memory sketch
//   appears right before module.exports at the bottom of this file.
// * Add expiration to the cache
// **********************************************************************
var internals = {
    tagImageLimit: 10, // if we still do not have imageLimit images by the time we start collecting from img tags, collect at most this many img tags
    imageLimit: 5, // how many images we want in general
    descriptionSizeLimite: 100 // maximum number of characters for the description
};
// General helper functions
internals.isEmpty = function(value) {
    return (value == null || value === '' || _.isUndefined(value) || _.isEmpty(value));
};
internals.checkEmpty = function(value, newValue) {
    return (value != null && value !== '' && !_.isUndefined(value) && !_.isEmpty(value)) ? value : newValue;
};
internals.addImage = function(aImages, image, urlParsed) {
    var regExp = /^(https?:\/\/)?((([a-z\d]([a-z\d-]*[a-z\d])*)\.)+[a-z]{2,}|((\d{1,3}\.){3}\d{1,3}))(\:\d+)?(\/[-a-z\d%_@.~+&=!#$%\*\(\)<>?]*)*(\?[;&a-z\d%_@.~+&=!#$%\*\(\)<>?]*)?(\#[-a-z\d_]*)?$/i;
    var addImage = false;
    if(aImages.length < internals.tagImageLimit) {
        if (!regExp.test(image)) {
            if (image != null && image !== '' && !_.isUndefined(image) && !_.isEmpty(image) && image.length >= 2) {
                // there are cases where CDNs start with //url.com/image, we do NOT accept this craziness
                if (image[1] !== '/') {
                    var imgUrlParsed = URL.parse(image);
                    // check for a relative url starting with / or without it (that is, just the image name and its path), then prepend the host
                    if (imgUrlParsed.protocol == null || imgUrlParsed.protocol === '') {
                        image = urlParsed.protocol + "//" + urlParsed.host + (image[0] === '/' ? image : ('/' + image));
                        addImage = true;
                    }
                }
            }
        } else
            addImage = true;
    }
    // we cannot check for image extensions in this case. More craziness: some CDNs have a path to an image without the extension (arghhhhhhh) --> github is one of them
    if(addImage)
        aImages.push(image);
    return aImages;
};
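// How addImage() behaves (illustrative values only), assuming urlParsed = URL.parse('https://example.com/some/page'):
// internals.addImage([], 'https://cdn.example.com/pic.jpg', urlParsed)  --> ['https://cdn.example.com/pic.jpg'] (absolute url, kept as-is)
// internals.addImage([], '/assets/pic.jpg', urlParsed)                  --> ['https://example.com/assets/pic.jpg'] (relative url, host prepended)
// internals.addImage([], '//cdn.example.com/pic.jpg', urlParsed)        --> [] (protocol-relative urls are rejected)
// It also stops adding images once aImages already holds internals.tagImageLimit entries.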
internals.checkImage = function(image) {
    if(!_.isEmpty(image) && image !== '')
        return image;
    return '';
};
internals.elipses = function(value, size) {
    var plainText = value.trim();
    if(plainText.length > size)
        return plainText.substr(0, size) + "…";
    return plainText.replace(/((&lt;|<)br\s*\/*(&gt;|>)\r\n)/g, ' ');
};
// Crawling functions
internals.crawl = function(uri, cb) {
    Request({ uri: uri }, function(err, response, body) {
        var noOpRetData = {
            host: '', title: '', rawDescription: '', description: '',
            imageCount: 0, images: [], views: 0, likes: 0, url: '', width: 0, height: 0, id: 0, source: 'url', sourceUrl: uri
        };
        if(err || (response && response.statusCode !== 200) || body == null || body === '') {
            cb(null, noOpRetData);
            return;
        }
        var $;
        try {
            $ = Cheerio.load(body);
            var urlParsed = URL.parse(uri);
            // Get the title
            var title = internals.checkEmpty('', $("meta[property='og:title']").attr("content"));
            if(internals.isEmpty(title))
                title = $("meta[name='title']").attr("content");
            if(internals.isEmpty(title))
                title = $("title").text();
            // Get the description
            var description = internals.checkEmpty('', $("meta[name='description']").attr("content"));
            if(internals.isEmpty(description))
                description = $("meta[property='og:description']").attr("content");
            if(internals.isEmpty(description))
                description = $("meta[property='twitter:description']").attr("content");
            if(internals.isEmpty(description))
                description = $('h1').html();
            // Get Images
            var aImages = [];
            var gotFromOGorTwitter = false; // try to get images from OpenGraph or Twitter first. If there are any, use just those
            // meta:name can be used
            $("meta[name='og:image']").each(function(i, elem) {
                var imageTmp = internals.checkImage($(elem).attr('content'));
                if(imageTmp !== '')
                    aImages = internals.addImage(aImages, imageTmp, urlParsed);
            });
            // Limit to 5 images
            if(aImages.length < internals.imageLimit) {
                // meta:property is more commonly used
                $("meta[property='og:image']").each(function (i, elem) {
                    var imageTmp = internals.checkImage($(elem).attr('content'));
                    if (imageTmp !== '')
                        aImages = internals.addImage(aImages, imageTmp, urlParsed);
                });
            }
            // Try to get through twitter meta tags
            if(aImages.length === 0) {
                // get twitter:image
                $("meta[name='twitter:image']").each(function(i, elem) {
                    var imageTmp = internals.checkImage($(elem).attr('content'));
                    if(imageTmp !== '')
                        aImages = internals.addImage(aImages, imageTmp, urlParsed);
                });
            }
            if(aImages.length === 0) {
                // get twitter:image:src
                $("meta[name='twitter:image:src']").each(function(i, elem) {
                    var imageTmp = internals.checkImage($(elem).attr('content'));
                    if(imageTmp !== '')
                        aImages = internals.addImage(aImages, imageTmp, urlParsed);
                });
            }
            // if we got images from twitter or open graph, then stop here. No need to parse any further
            // also, if we already have 5 (internals.imageLimit) images, then stop
            gotFromOGorTwitter = aImages.length > 0;
            if(!gotFromOGorTwitter && aImages.length < internals.imageLimit) {
                // there can be up to 10 (internals.tagImageLimit) images when we parse from img tags
                $("img").each(function (i, elem) {
                    var $el = $(elem);
                    // check for lazy loading --> UOL uses it (more originality)
                    var imageTmp = $el.attr('data-original');
                    if(_.isEmpty(imageTmp))
                        imageTmp = internals.checkImage($el.attr('src'));
                    if (imageTmp !== '')
                        aImages = internals.addImage(aImages, imageTmp, urlParsed);
                });
            }
            if(!gotFromOGorTwitter && aImages.length < internals.imageLimit) {
                // get background url / background-image if we have not gotten any images yet
                $('[style]').each(function (i, elem) {
                    // use a separate variable for the regexp matches so we do not shadow the outer aImages array
                    var styleMatches = ($(elem).attr('style') || '').match(/url\(.*?\)/ig);
                    if (styleMatches && styleMatches.length > 0) {
                        for (var j = 0; j < styleMatches.length; j++) {
                            var item = styleMatches[j];
                            var imageTmp;
                            imageTmp = item.substring(item.indexOf('(') + 1, item.indexOf(')'));
                            // only accept images with known extensions. Crazy CDNs will be excluded.
                            if (imageTmp.indexOf('jpg') !== -1 || imageTmp.indexOf('png') !== -1 ||
                                imageTmp.indexOf('gif') !== -1 || imageTmp.indexOf('jpeg') !== -1) {
                                if (imageTmp.indexOf("'") === -1 && imageTmp.indexOf("&apos;") === -1) {
                                    aImages = internals.addImage(aImages, imageTmp, urlParsed);
                                }
                            }
                        }
                    }
                });
            }
            if(!gotFromOGorTwitter && aImages.length < internals.imageLimit) {
                // some dudes also use meta:itemprop --> such good imagination (Google is one of them)
                aImages = internals.addImage(aImages, $("meta[itemprop='image']").attr("content"), urlParsed);
            }
            // Crazy little trick. After making sure the images do not repeat (unique), reverse the array. Why, you might ask?
            // We assume that the most interesting images are the ones in the middle of the page.
            // Reversing makes sure we get the most interesting ones first.
            aImages = _.uniq(aImages).reverse();
            var retData = {
                host: internals.checkEmpty(urlParsed.host, ''), // the calling host. ex. http://google.com --> google.com
                title: internals.checkEmpty(title, ''), // the crawled title
                description: internals.elipses(internals.checkEmpty(description, ''), internals.descriptionSizeLimite), // the description limited to descriptionSizeLimite characters
                rawDescription: internals.checkEmpty(description, ''), // the crawled description
                imageCount: aImages.length, // image count
                images: aImages, // the array of image urls collected
                views: 0, // used only for video crawling (kept here for compatibility when we add video crawling)
                likes: 0, // used only for video crawling (kept here for compatibility when we add video crawling)
                url: uri, // the actual incoming url
                width: 0, // used only for video crawling (kept here for compatibility when we add video crawling)
                height: 0, // used only for video crawling (kept here for compatibility when we add video crawling)
                id: 0, // used only for video crawling (kept here for compatibility when we add video crawling)
                source: 'url' // just to reference that we crawled a URL. In the future we might add more types of crawling like youtube, vimeo, twitter...
            };
            cb(null, retData);
        } catch(ex) {
            cb(null, noOpRetData);
        }
    });
};
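// A minimal in-memory cache wrapper, as suggested in the "Things to improve" notes above.
// This is only a sketch (no expiration, unbounded growth); a real setup would more likely use
// Redis or memcached, as the notes mention. internals.cachedCrawl is a name introduced here for
// illustration and could be exported alongside parse if needed.
var crawlCache = {};
internals.cachedCrawl = function(uri, cb) {
    if (crawlCache[uri]) {
        // serve the previously parsed data without hitting the network again
        return cb(null, crawlCache[uri]);
    }
    internals.crawl(uri, function(err, retData) {
        if (!err)
            crawlCache[uri] = retData;
        cb(err, retData);
    });
};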
module.exports = {
    parse: internals.crawl
};
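// Example usage (the require path and URL below are placeholders):
// var crawler = require('./crawler');
// crawler.parse('https://github.com/', function(err, data) {
//     // data follows the retData shape above: host, title, description, images, ...
//     console.log(data.title, data.imageCount, data.images);
// });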