Last active
January 3, 2017 21:20
-
-
Save jamlfy/2c1b14f64a75f59a0a0b62ede0180adb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// PhantomJS built-in module; startPage uses it to create one page object per URL.
var webPage = require('webpage');
// Site root; the crawl is started at HOME + 'index.html'.
var HOME = 'http://my.domain.com/';
// Cursor into the list currently being walked (top-menu entries first, then POST).
// startPage increments it on every call and resets it when switching to posts.
var stepIndex = 0;
// The four strings below are placeholders: each one is handed to jQuery's $()
// inside the loaded page, so replace them with real selectors before running.
var HOME_CONTENT = 'Where ara the info in home';   // container read by getHome
var POST_MENU = 'Go to menu';                      // links collected by getPost
var TOP_MENU = 'Top Menu';                         // links indexed by getMenu
var CONTENT_POST = 'Where are the info in pages';  // container read by getPostOrPages
// Scraped results keyed by page URL; dumped as JSON when the crawl ends.
var PAGES = {};
// Post URLs (hrefs) collected from the index page by getPost.
var POST = [];
/**
 * Relays a message printed by the loaded page's console to the PhantomJS
 * console, prefixed with '>'. Installed as `page.onConsoleMessage` in startPage.
 *
 * @param {string} msg - Text the page logged.
 * @param {number} [line] - Accepted but not used.
 * @param {string} [source] - Accepted but not used.
 */
function posibleError (msg, line, source) {
  console.log('>', msg);
}
/**
 * Relays an `alert()` raised by the loaded page to the PhantomJS console,
 * prefixed with '<'. Installed as `page.onAlert` in startPage.
 *
 * @param {string} msg - Text passed to alert() by the page.
 */
function posibleAlert (msg) {
  console.log('<', msg);
}
/**
 * Extracts one post/page from the children of the element(s) matched by
 * `content`. It calls the page-level jQuery `$`, so it is meant to be run
 * through page.evaluate after jQuery has been injected.
 *
 * DIV wrappers are flattened recursively; only whitelisted tags are kept.
 * The H1 becomes the title, the first non-empty top-level I becomes the
 * author, and everything else that has content is collected as HTML.
 *
 * @param {string} content - Value handed to `$()` to find the post container.
 * @returns {{text: string[], title: (string|undefined), autor: (string|undefined)}}
 */
function getPostOrPages (content) {
  // Anchored on purpose: an unanchored alternation also accepts any tag that
  // merely contains one of these names (SPAN, SCRIPT, INPUT, LINK, ...).
  var KEEP = /^(?:P|H[1-6]|UL|I|TABLE|LI|IMG)$/i;

  // Returns a flat array of whitelisted elements, descending into DIVs.
  function clean (node) {
    var newNode = [];
    for (var i = 0; i < node.length; i++) {
      var el = node[i];
      if (!el || el.body || !el.tagName) {
        continue;
      }
      if (el.tagName == 'DIV') {
        // Collapse whitespace and turn <br> into paragraph breaks before re-parsing.
        // NOTE(review): whitespace runs are removed outright, which can glue words together.
        var html = el.innerHTML.replace(/\t|\n|\s{2,}/gim, '').replace(/(\<br\>)/gim, '</p><p>');
        var z = clean($(html));
        for (var w = 0; w < z.length; w++) {
          newNode.push(z[w]);
        }
      } else if (KEEP.test(el.tagName)) {
        newNode.push(el);
      }
    }
    return newNode;
  }

  var post = { text : [] };
  var child = clean($(content).children());
  for (var i = 0; i < child.length; i++) {
    var item = child[i];
    if (item.tagName == 'H1') {
      post.title = item.innerText;
    } else if (item.tagName == 'I' && !post.autor) {
      // Must be tested before the generic text branch, otherwise it is unreachable
      // for any <i> that actually has content.
      post.autor = item.innerText;
    } else if (item.tagName == 'IMG' || item.innerHTML.length) {
      // IMG has no innerHTML, so it needs its own pass to avoid being dropped.
      post.text.push(item.outerHTML);
    }
  }
  return post;
}
/**
 * Splits the children of the element(s) matched by `content` into entries,
 * one per H2 heading. Calls the page-level jQuery `$`, so it is meant to be
 * run through page.evaluate after jQuery has been injected.
 *
 * Nodes before the first H2 are ignored. Within an entry, BR nodes are
 * skipped, an I node sets the author, and every other node is stored as
 * whitespace-stripped HTML.
 *
 * @param {string} content - Value handed to `$()` to find the listing container.
 * @returns {Array<{title: string, text: string[], autor: (string|undefined)}>}
 */
function getHome(content) {
  var data = [];
  var post = {};
  var cild = $(content).children();
  for (var i = 0; i < cild.length; i++) {
    var el = cild[i];
    if (el.tagName === 'H2') {
      // A new heading closes the entry collected so far.
      if (post.title) {
        data.push(post);
      }
      post = { title : el.innerText, text : [] };
    } else if (post.text && el.tagName != 'BR') {
      if (el.tagName != 'I') {
        post.text.push(el.outerHTML.replace(/\t|\n|(\s){2,}|(\<br\>){2,}/gim, ''));
      } else {
        post.autor = el.innerText;
      }
    }
  }
  // Flush the last entry; no later H2 will do it.
  if (post.title) {
    data.push(post);
  }
  return data;
}
/**
 * Forwards a console message coming from the loaded page to the PhantomJS
 * console with a '>' prefix. startPage assigns it to `page.onConsoleMessage`.
 *
 * @param {string} msg - Text the page logged.
 * @param {number} [line] - Accepted but not used.
 * @param {string} [source] - Accepted but not used.
 */
function posibleError (msg, line, source) {
  console.log('>', msg);
}
/**
 * Forwards an `alert()` coming from the loaded page to the PhantomJS console
 * with a '<' prefix. startPage assigns it to `page.onAlert`.
 *
 * @param {string} msg - Text passed to alert() by the page.
 */
function posibleAlert (msg) {
  console.log('<', msg);
}
/**
 * Extracts one post/page from the children of the element(s) matched by
 * `name`. It calls the page-level jQuery `$`, so it is meant to be run
 * through page.evaluate after jQuery has been injected.
 *
 * DIV wrappers are flattened recursively; only whitelisted tags are kept.
 * The H1 becomes the title, the first non-empty top-level I becomes the
 * author, and everything else that has content is collected as HTML.
 *
 * @param {string} name - Value handed to `$()` to find the post container.
 * @returns {{text: string[], title: (string|undefined), autor: (string|undefined)}}
 */
function getPostOrPages (name) {
  // Anchored on purpose: an unanchored alternation also accepts any tag that
  // merely contains one of these names (SPAN, SCRIPT, INPUT, LINK, ...).
  var KEEP = /^(?:P|H[1-6]|UL|I|TABLE|LI|IMG)$/i;

  // Returns a flat array of whitelisted elements, descending into DIVs.
  function clean (node) {
    var newNode = [];
    for (var i = 0; i < node.length; i++) {
      var el = node[i];
      if (!el || el.body || !el.tagName) {
        continue;
      }
      if (el.tagName == 'DIV') {
        // Collapse whitespace and turn <br> into paragraph breaks before re-parsing.
        // NOTE(review): whitespace runs are removed outright, which can glue words together.
        var html = el.innerHTML.replace(/\t|\n|\s{2,}/gim, '').replace(/(\<br\>)/gim, '</p><p>');
        var z = clean($(html));
        for (var w = 0; w < z.length; w++) {
          newNode.push(z[w]);
        }
      } else if (KEEP.test(el.tagName)) {
        newNode.push(el);
      }
    }
    return newNode;
  }

  var post = { text : [] };
  var child = clean($(name).children());
  for (var i = 0; i < child.length; i++) {
    var item = child[i];
    if (item.tagName == 'H1') {
      post.title = item.innerText;
    } else if (item.tagName == 'I' && !post.autor) {
      // Must be tested before the generic text branch, otherwise it is unreachable
      // for any <i> that actually has content.
      post.autor = item.innerText;
    } else if (item.tagName == 'IMG' || item.innerHTML.length) {
      // IMG has no innerHTML, so it needs its own pass to avoid being dropped.
      post.text.push(item.outerHTML);
    }
  }
  return post;
}
/**
 * Splits the children of the element(s) matched by `name` into entries, one
 * per H2 heading. Calls the page-level jQuery `$`, so it is meant to be run
 * through page.evaluate after jQuery has been injected.
 *
 * Nodes before the first H2 are ignored. Within an entry, BR nodes are
 * skipped, an I node sets the author, and every other node is stored as
 * whitespace-stripped HTML.
 *
 * @param {string} name - Value handed to `$()` to find the listing container.
 * @returns {Array<{title: string, text: string[], autor: (string|undefined)}>}
 */
function getHome(name) {
  var data = [];
  var post = {};
  var cild = $(name).children();
  for (var i = 0; i < cild.length; i++) {
    var el = cild[i];
    if (el.tagName === 'H2') {
      // A new heading closes the entry collected so far.
      if (post.title) {
        data.push(post);
      }
      post = { title : el.innerText, text : [] };
    } else if (post.text && el.tagName != 'BR') {
      if (el.tagName != 'I') {
        post.text.push(el.outerHTML.replace(/\t|\n|(\s){2,}|(\<br\>){2,}/gim, ''));
      } else {
        post.autor = el.innerText;
      }
    }
  }
  // Flush the last entry; no later H2 will do it.
  if (post.title) {
    data.push(post);
  }
  return data;
}
/**
 * Collects the `href` of every element matched by `name`, in match order.
 * Calls the page-level jQuery `$`, so it is meant to be run through
 * page.evaluate after jQuery has been injected.
 *
 * @param {string} name - Value handed to `$()` to find the post links.
 * @returns {Array} One href per matched element (empty when nothing matches).
 */
function getPost (name) {
  var urs = [];
  var items = $(name);
  for (var i = 0; i < items.length; i++) {
    urs.push(items[i].href);
  }
  return urs;
}
/**
 * Returns the `href` of the i-th element matched by `top`. Calls the
 * page-level jQuery `$`, so it is meant to be run through page.evaluate
 * after jQuery has been injected.
 *
 * @param {number} i - Zero-based index into the matched elements.
 * @param {string} top - Value handed to `$()` to find the menu links.
 * @returns {(string|null)} The link target, or null when there is no i-th
 *   element (previously this case threw a TypeError inside the page).
 */
function getMenu (i, top) {
  var entry = $(top)[i];
  return entry ? entry.href : null;
}
/**
 * Loads `url`, scrapes it into PAGES[url], then recurses to the next URL.
 *
 * Crawl order: top-menu pages first (next link = getMenu(stepIndex, TOP_MENU)),
 * then, once the menu yields nothing, the post URLs gathered in POST from the
 * index page. A call with a falsy `url` ends the crawl: PAGES is printed as
 * JSON and PhantomJS exits.
 *
 * Reads/writes the file-level `stepIndex`, `PAGES` and `POST`.
 *
 * @param {string} [url] - Page to load; falsy means "finished".
 * @param {boolean} [isPost] - True while walking POST instead of the top menu.
 */
function startPage(url, isPost) {
  stepIndex++;

  if (!url) {
    console.log(JSON.stringify(PAGES, null, '\t'));
    // phantom.exit() is the documented way to end the script; `phantom.kill`
    // does not exist and left the process hanging.
    phantom.exit();
    return;
  }

  var page = webPage.create();
  page.onConsoleMessage = posibleError;
  page.onAlert = posibleAlert;

  // Tags the scraped entry, tolerating an evaluate() that produced nothing.
  function mark (key) {
    if (PAGES[url]) {
      PAGES[url][key] = true;
    }
  }

  page.open(url, function (status) {
    if (status !== 'success') {
      // Do not stall the crawl on one bad page. The next menu link can only be
      // read from a loaded page, so a failed menu page falls through to the posts.
      console.log('Fail :', url);
      page.close();
      if (!isPost) {
        stepIndex = 0;
      }
      startPage(POST[stepIndex], true);
      return;
    }

    console.log('Start :', url);
    page.injectJs('jquery.min.js');

    // NOTE(review): `phantom.state` is not part of the documented PhantomJS API,
    // so this branch looks unreachable; kept as in the original - confirm intent.
    if (phantom.state) {
      phantom.state();
      return;
    }

    var isIndex = url.indexOf('index.html') >= 0;
    var newUrl;

    if (isPost || !isIndex) {
      PAGES[url] = page.evaluate(getPostOrPages, CONTENT_POST);
    }
    if (isPost) {
      mark('post');
      newUrl = POST[stepIndex];
    } else {
      if (isIndex) {
        POST = page.evaluate(getPost, POST_MENU) || [];
        PAGES[url] = page.evaluate(getHome, HOME_CONTENT);
        // NOTE(review): getHome returns an array, and JSON.stringify drops
        // non-index properties of arrays, so this flag is absent from the dump.
        mark('list');
      } else {
        mark('page');
      }
      newUrl = page.evaluate(getMenu, stepIndex, TOP_MENU);
    }

    // Each URL gets its own page object; release it before moving on.
    page.close();

    // Menu exhausted: switch to the posts. Only done while still in menu mode,
    // otherwise the end of POST would restart at POST[0] forever.
    if (!newUrl && !isPost) {
      stepIndex = 0;
      newUrl = POST[stepIndex];
      isPost = true;
    }
    startPage(newUrl, isPost);
  });
}
// Entry point: begin the crawl at the site's index page.
startPage(HOME + 'index.html');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment