Created
July 29, 2010 21:27
-
-
Save anonymous/499291 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"translatorID":"207f4aad-b604-43ef-a7f5-3e6229aade9f","label":"New Zealand Herald","creator":"Sopheak Hean (University of Waikato, Faculty of Education, New Zealand)","target":"www.nzherald.co.nz","minVersion":"1.0","maxVersion":"","priority":100,"inRepository":"1","translatorType":4,"lastUpdated":"2010-08-03 15:31:36"}
/* Big thanks to
 * Michael Collins,
 * Frank Bennett and
 * Avram Lyon
 * for their help with optimising the code */
/**
 * Decide what kind of page the translator is looking at.
 *
 * The decision is made purely from the page address: a search results
 * page yields several candidate articles, an article page yields one.
 *
 * @param {Document} doc - page DOM; its location.href is the preferred address
 * @param {string} url - address used only when doc.location.href is unavailable
 * @returns {string|undefined} "multiple" for a search results page,
 *   "newspaperArticle" for an article page, undefined for anything else
 */
function detectWeb(doc, url) {
  var href = (doc && doc.location && doc.location.href) || url || "";

  // Search result listings live under /search/results.cfm.
  if (href.indexOf("/search/results.cfm") !== -1) {
    return "multiple";
  }
  // Individual stories live under /news/article.cfm.
  if (href.indexOf("/news/article.cfm") !== -1) {
    return "newspaperArticle";
  }
  // Any other page: fall through and return undefined (no match).
}
/**
 * Copy one scraped value onto a Zotero item, if that value is present.
 *
 * Nothing is written when items[field] is missing or otherwise falsy,
 * so an existing value on newItem is never overwritten with a blank.
 *
 * @param {Object} newItem - item being populated (mutated in place)
 * @param {Object} items - scraped values keyed by their source label
 * @param {string} field - key to look up in items
 * @param {string} zoteroField - property name to set on newItem
 */
function associateData(newItem, items, field, zoteroField) {
  if (items[field]) {
    newItem[zoteroField] = items[field];
  }
}
/**
 * Scrape one New Zealand Herald article page into a Zotero
 * "newspaperArticle" item and hand it over with item.complete().
 *
 * Every scraped field is optional: when the element that carries it is
 * not found on the page, the field is simply left at its default instead
 * of aborting the whole scrape.
 *
 * @param {Document} doc - DOM of the article page
 * @param {string} url - not used; the item URL is taken from doc.location.href
 */
function scrape(doc, url) {
  var namespace = doc.documentElement.namespaceURI;
  var nsResolver = namespace ? function (prefix) {
    return prefix === 'x' ? namespace : null;
  } : null;

  // First node matching the XPath expression, or null when nothing matches.
  function firstNode(xpath) {
    return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
  }

  // Text of a tools-bar span with a leading "h:mm AM/PM Weekday " removed
  // and surrounding whitespace trimmed.
  function dateText(node) {
    return node.textContent
      .replace(/\d{1,2}:\d{1,2} (AM|PM) (\w)+ /g, '')
      .replace(/^\s+|\s+$/g, '');
  }

  var newItem = new Zotero.Item('newspaperArticle');
  newItem.url = doc.location.href;
  newItem.title = "No Title Found";
  newItem.publicationTitle = "New Zealand Herald";
  newItem.ISSN = "1170-0777";
  newItem.language = "English";

  // Title: keep the placeholder above when the page has no usable <h1>.
  var titleNode = firstNode('//h1');
  if (titleNode && titleNode.textContent) {
    newItem.title = titleNode.textContent;
  }

  // Authors: the credits line looks like "By A and B" or "By A of Org".
  // Strip the "By " prefix, split on "and" when it is present, and drop a
  // trailing " of <organisation>" from each name.
  try {
    var creditsNode = firstNode('//span[@class="credits"]');
    if (creditsNode) {
      var credits = creditsNode.textContent.replace(/\bBy\W+/g, '');
      var names;
      if (credits.search(/\W\band\W+/) !== -1) {
        names = credits.replace(/\W\band\W+/g, ', ').split(', ');
      } else {
        // Without "and" the whole credits string is one author; it is
        // deliberately not split on commas.
        names = [credits];
      }
      for (var i = 0; i < names.length; i++) {
        var name = names[i].replace(/\W\bof\W(.*)/g, '');
        if (/\S/.test(name)) {
          newItem.creators.push(Zotero.Utilities.cleanAuthor(name, "author"));
        }
      }
    }
  } catch (err) {
    // Author parsing is best-effort: record the problem and save the item
    // without creators rather than losing the whole article.
    Zotero.debug("New Zealand Herald: could not parse authors: " + err);
    newItem.creators = [];
  }

  // Date: the first tools-bar span is either the date itself or an
  // "Updated"/"New" badge, in which case the date sits in the second span.
  var dateNode = firstNode('//div[@class="tools"]/span');
  if (dateNode) {
    var dateValue = dateText(dateNode);
    if (dateValue === "Updated" || dateValue === "New") {
      dateNode = firstNode('//div[@class="tools"]/span[2]');
      dateValue = dateNode ? dateText(dateNode) : '';
    }
    if (dateValue) {
      newItem.date = dateValue;
    }
  }

  // Section: first link in the section header.
  var sectionNode = firstNode('//div[@class="sectionHeader"]/span/a[1]');
  if (sectionNode) {
    newItem.section = sectionNode.textContent;
  }

  // Abstract: taken from the page's description meta tag.
  var descriptionNode = firstNode("//meta[@name='description']");
  if (descriptionNode) {
    newItem.abstractNote = descriptionNode.content;
  }

  newItem.complete();
}
/**
 * Translator entry point: work out which article pages to import and
 * pass each of them to scrape().
 *
 * On a search results page the result links are offered to the user via
 * Zotero.selectItems and only the chosen ones are processed; on any other
 * page the current url is processed directly.
 *
 * @param {Document} doc - DOM of the current page
 * @param {string} url - address of the current page
 * @returns {boolean|undefined} true when the user cancels the selection
 *   dialog (nothing is processed); otherwise undefined
 */
function doWeb(doc, url) {
  var namespace = doc.documentElement.namespaceURI;
  var nsResolver = namespace ? function (prefix) {
    return prefix === 'x' ? namespace : null;
  } : null;

  var articles = [];
  if (detectWeb(doc, url) === "multiple") {
    // Map each search result link (href) to its visible title.
    var items = {};
    var titles = doc.evaluate('//p[@class="results"]/a', doc, nsResolver, XPathResult.ANY_TYPE, null);
    var nextTitle;
    while ((nextTitle = titles.iterateNext())) {
      items[nextTitle.href] = nextTitle.textContent;
    }

    items = Zotero.selectItems(items);
    // A falsy result means the dialog was dismissed: stop without
    // fetching anything.
    if (!items) {
      return true;
    }
    for (var link in items) {
      if (Object.prototype.hasOwnProperty.call(items, link)) {
        articles.push(link);
      }
    }
  } else {
    articles = [url];
  }

  // Load every chosen page and scrape it; signal completion when all are done.
  Zotero.Utilities.processDocuments(articles, scrape, function () {
    Zotero.done();
  });
  Zotero.wait();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment