Last active
July 30, 2016 01:14
-
-
Save tim-peterson/36271751df0c4858b3df985bf194d431 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Entry point: read a fixed id range of schools and start one Bing search per
// school for "<school name> student organizations". (An earlier note on this
// query mentioned narrowing it with "and id=35402".)
connection.query('SELECT * from schools where id > 4479 && id < 4490;', function (err, schools) {
  if (err) {
    throw err;
  }
  var last_index = schools.length - 1;
  schools.forEach(function (school, index) {
    var search_terms = school.name + " student organizations";
    var search_url = "http://www.bing.com/search?q=" + encodeURIComponent(search_terms);
    // The school's id is handed on as parent_org.
    bingIt(search_url, school.name, school.id);
    // Raise the flag once the last search has been kicked off.
    if (index === last_index) {
      close_connection = true;
    }
  });
});
/**
 * Fetch one club page, pull contact details out of its text and store them.
 *
 * First pass (`second_pass` is false): the page behind `links[l]` is fetched
 * and a row is INSERTed into organization_leads_clubs. If no e-mail was found,
 * the first anchor on the page whose href contains "contact" is followed by
 * calling this function again as a second pass.
 *
 * Second pass (`second_pass` is the URL of the page the link came from): the
 * linked page is fetched and the row `row_to_update` is UPDATEd with whatever
 * was found there, including the longest text block as the description.
 *
 * @param {Array} links - array of cheerio-wrapped anchors (each needs `[0].attribs.href` and `.text()`).
 * @param {number} l - index of the anchor in `links` to process.
 * @param {*} parent_org - value stored in the row's parent_org column.
 * @param {(string|boolean)} second_pass - false on the first pass, otherwise the URL used to
 *   resolve a relative href.
 * @param {*} row_to_update - id of the row to UPDATE on the second pass (unused on the first).
 * @returns {undefined} Results are delivered through the database side effects only.
 */
function get_each_club_page(links, l, parent_org, second_pass, row_to_update){
  var link = links[l];
  var anchor = link ? link[0] : undefined;
  // An empty selection or an anchor without an href cannot be followed.
  var href = (anchor && anchor.attribs) ? anchor.attribs.href : undefined;
  if (!href) return;

  var indiv_club_link = href;
  if (second_pass && indiv_club_link.substring(0, 4) !== "http") {
    // Relative link found on a club page: prefix it with that page's URL.
    indiv_club_link = second_pass + '/' + indiv_club_link;
  }

  var name = link.text().replace(/\s+/g, ' ').trim();
  if (name == '') return;

  request(indiv_club_link, function (error, response, body) {
    if (error || response.statusCode != 200) {
      console.error('get_each_club_page: could not fetch ' + indiv_club_link, error || response.statusCode);
      return;
    }

    var contact_link = indiv_club_link;
    var $ = cheerio.load(body);

    // The first pass stores an empty description; only the second pass
    // fills it, using the longest whitespace-normalised text block.
    var description = '';
    if (second_pass) {
      $('p, p *, div, div *, li, li *').each(function () {
        var text = $(this).text().replace(/\s+/g, ' ').trim();
        // Compare normalised lengths so padding whitespace cannot win.
        if (text.length > description.length) {
          description = text;
        }
      });
    }

    var innerText = second_pass ? $('*').text() : $('body').text();

    // address_regex is defined outside this block; resetting lastIndex is a
    // no-op for a plain regex and avoids stale state if it carries the g/y flag.
    address_regex.lastIndex = 0;
    var address_arr = address_regex.exec(innerText);
    var address = (address_arr != null && address_arr.length > 0) ? address_arr[0] : '';

    var numbers = new PhoneNumberParser();
    numbers.parse(innerText);
    var phone_arr = [];
    if (typeof numbers.items != 'undefined') {
      for (var ni = 0; ni < numbers.items.length; ni++) {
        // Keep only entries with at least 9 characters.
        if (numbers.items[ni].length >= 9) {
          phone_arr.push(numbers.items[ni]);
        }
      }
    }
    var phone = phone_arr.join();

    var email = extractEmails(innerText);
    if (email != null) email = email.join();
    // With no e-mail a random placeholder is stored instead
    // (presumably to satisfy a unique constraint on the column - TODO confirm).
    var stored_email = (email != null) ? email : email + '__randomizer__' + Math.random();

    if (second_pass == false) {
      var insert_values = [name, phone, address, stored_email, parent_org, description, contact_link];
      connection.query('INSERT organization_leads_clubs SET name = ?,phone = ?,address = ?,email = ?,parent_org = ?,description = ?,website = ?', insert_values, function (err, rows) {
        if (err) {
          console.error('get_each_club_page: INSERT failed for ' + contact_link, err);
          return;
        }
        if (email == null) {
          // Follow the first "contact" link once; returning false stops the scan.
          $('a').each(function () {
            var candidate = $(this)[0];
            var candidate_href = (candidate && candidate.attribs) ? candidate.attribs.href : undefined;
            if (typeof candidate_href != 'undefined' && candidate_href.toLowerCase().indexOf('contact') > -1) {
              get_each_club_page([$(this)], 0, parent_org, contact_link, rows.insertId);
              return false;
            }
          });
        }
        console.log('INSERT organization_leads_clubs first pass');
      });
    }
    else if (second_pass) {
      var update_values = [phone, address, stored_email, description, contact_link, row_to_update];
      connection.query('UPDATE organization_leads_clubs SET phone = ?,address = ?,email = ?,description = ?,contact_link = ? where id = ?', update_values, function (err) {
        if (err) {
          console.error('get_each_club_page: UPDATE failed for row ' + row_to_update, err);
          return;
        }
        console.log('UPDATE organization_leads_clubs' + second_pass);
      });
    }
  });
}
/**
 * Run one Bing search, pick a result page and scrape the club links on it.
 *
 * Among the first nine results the first href containing "organizations" or
 * "clubs" (case-insensitive) is chosen; if none matches, the first result is
 * used. The chosen page is fetched, `find_club_links` extracts its club links
 * and each one is handed to `get_each_club_page` as a first pass.
 *
 * @param {string} url - full Bing search URL to request.
 * @param {string} school_name - name of the school being searched (used in log output only).
 * @param {*} parent_org - passed through to get_each_club_page for every club link.
 * @returns {undefined}
 */
function bingIt(url, school_name, parent_org){
  // Returns the href of a result element, or '' when it is missing.
  function href_of(element) {
    var attribs = element ? element.attribs : undefined;
    return (attribs && typeof attribs.href == 'string') ? attribs.href : '';
  }

  throttledRequest({url : url, headers : headers}, function (error, response, body) {
    if (error) {
      console.error('bingIt: search failed for ' + school_name + ': ' + error);
      return;
    }

    var $ = cheerio.load(body);
    var all_links = $(".b_algo > h2 > a");

    var website = '';
    for (var a = 0; a < 9; a++) {
      var candidate = href_of(all_links[a]);
      var lowered = candidate.toLowerCase();
      if (lowered.indexOf('organizations') > -1 || lowered.indexOf('clubs') > -1) {
        website = candidate;
        break;
      }
    }
    if (website == '') {
      // No keyword match: fall back to the top search result.
      website = href_of(all_links[0]);
    }
    if (website == '') {
      return;
    }

    request(website, function (error, response, body) {
      if (error || response.statusCode != 200) {
        console.error('bingIt: could not fetch ' + website, error || response.statusCode);
        return;
      }
      var $ = cheerio.load(body);
      var links = find_club_links($);
      for (var l = 0; l < links.length; l++) {
        get_each_club_page(links, l, parent_org, false, false);
      }
    });
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment