JavaScript console (in-browser) crawler
let crawlDom, getCats, rowify, save, crawlUrl, crawlUrls; // functions
let urlBase, pages; // input data
// config
let waitInterval = 500; // milliseconds to wait between page fetches
// regex that captures the productcategories array embedded in an inline <script> tag
let catRe = /\{\s*productcategories\s*:\s*(\[\s*\{[\s\S]*\}\s*\])\s*\}/m;
// CSS selectors for the fields to extract from each page
let selectors = {
	'name' : '#showroomTopContentDiv .showroom-header h1',
	'desc' : '#scroll-description .showroom-about',
	'booth' : '#scroll-boothlinks #newfloorplanlink strong',
};
class Output {
	data = [];      // collected rows
	counters = {};  // named counters: pages_found, pages_processed, errors
	timers = [];    // pending setTimeout handles, kept so a run can be cancelled
	theLog = [];    // error messages
	stop = () => {
		this.timers.forEach(t => clearTimeout(t));
	}
	increment = (counter) => {
		(this.counters[counter] === undefined) && (this.counters[counter] = 0);
		this.counters[counter]++;
	}
	log = (msg) => {
		this.theLog.push(msg);
	}
	append = (obj) => this.data.push(obj);
	get = () => this.data;
}
// runs each selector against a parsed DOM; returns [key, [text, ...]] pairs
crawlDom = (selectors, dom) => Object.keys(selectors).map(k =>
	[k, [...dom.querySelectorAll(selectors[k])].map(e => e.textContent.trim())]);
getCats = (document, re) => {
	// extracts categories from the page source, from inside <script> tags
	let cat;
	let scriptTags = [...document.querySelectorAll('script')];
	let matches = scriptTags.map(e => e.textContent).map(s => s.match(re)).filter(m => m !== null);
	if (matches.length > 0 && matches[0].length > 1) {
		// the captured text is a JS array literal, not strict JSON, hence eval
		eval("cat=" + matches[0][1]);
	} else {
		cat = [null]; // no match: rowify still emits one row with empty category fields
	}
	return cat;
}
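// Illustrative only: the shape the regex is assumed to capture inside a script
// tag (field names inferred from rowify below, which reads cat.category and
// cat.subcategory; the actual page may carry more fields):
// { productcategories: [
//     { category: "Hardware", subcategory: "Sensors" },
//     { category: "Hardware", subcategory: "Actuators" }
// ] }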
rowify = (obj) => {
	// explodes parsed data into an array with one entry per subcategory
	// also flattens array-valued properties
	return obj.categories.map((cat) => ({
		url: obj.url,
		category: cat?.category,
		subcategory: cat?.subcategory,
		name: obj.name[0],
		desc: obj.desc[0],
		booth: obj.booth.join(', ')
	}));
}
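// For illustration (all values assumed):
// rowify({ url: 'https://example.com/?id=1', name: ['Acme Corp'], desc: ['Widgets'],
//          booth: ['A1', 'B2'],
//          categories: [{category: 'Hardware', subcategory: 'Sensors'},
//                       {category: 'Hardware', subcategory: 'Actuators'}] })
// // => two rows, one per subcategory, each with booth flattened to 'A1, B2'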
save = (obj, filename = 'data.json') => {
	// serialize to pretty-printed JSON and trigger a browser download
	let data = JSON.stringify(obj, undefined, 4);
	let blob = new Blob([data], { type: 'text/json' });
	let a = document.createElement('a');
	a.download = filename;
	a.href = window.URL.createObjectURL(blob);
	a.dataset.downloadurl = ['text/json', a.download, a.href].join(':');
	a.click(); // the deprecated createEvent/initMouseEvent pair is not needed
}
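// usage sketch (filename is illustrative): once a crawl has finished,
// save(o.get(), 'exhibitors.json');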
crawlUrl = async (url) => {
	// fetches one page, parses it off-screen, and extracts fields + categories;
	// assumes same-origin URLs: with {mode: 'no-cors'} a cross-origin response
	// would be opaque and res.text() would come back empty
	let res = await fetch(url, {mode: 'no-cors'});
	let txt = await res.text();
	let parser = new DOMParser();
	let dom = parser.parseFromString(txt, "text/html");
	let gatheredData = Object.fromEntries(crawlDom(selectors, dom));
	gatheredData.url = url;
	gatheredData.categories = getCats(dom, catRe);
	return gatheredData;
}
crawlUrls = (pages) => {
	// schedules one fetch per unique URL, spaced waitInterval ms apart,
	// and returns an Output object that fills up as the timers fire
	let o = new Output();
	[...new Set(pages)].forEach((url, index) => {
		o.increment('pages_found');
		o.timers.push(setTimeout(async () => {
			try {
				let d = await crawlUrl(url);
				rowify(d).forEach(r => o.append(r));
				o.increment('pages_processed');
			} catch (e) {
				o.increment('errors');
				o.log(e);
			}
		}, index * waitInterval));
	});
	return o;
}
/* getting pages
go to https://himss24.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false
switch to list mode, and keep clicking "load more" at the bottom until the list is exhausted
then, in the console:
pages = Array.from(document.querySelectorAll('section#exhibitor-results table.results-table > tbody > tr.js-List > td:nth-child(2) a[href]')).map(e => e.getAttribute('href'));
// then deduplicate
pages = [...new Set(pages)];
*/
// running the crawler:
// define urlBase, then define the pages array with URLs relative to urlBase
// (e.g. pages = ['/ex/pages/?id=3424', '/ex/pages/?id=5523'], etc.)
// let o = crawlUrls(pages.map(url => (urlBase + url)));
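// a complete session might look like this (the urlBase value is illustrative):
// urlBase = 'https://himss24.mapyourshow.com';
// let o = crawlUrls(pages.map(url => urlBase + url));
// // watch o.counters until pages_processed + errors === pages_found, then:
// save(o.get(), 'exhibitors.json');
// o.stop(); // clears any timers still pending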