Created
June 23, 2023 07:19
-
-
Save natzir/473d9731087c9f82d0cd9ec4e27d69c2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Scraping depth requested by the user. scrollPage() compares `count` against `depth`
// to decide whether to run another start() pass before extracting.
// NOTE(review): `var` is kept on purpose — presumably this snippet is pasted into the
// console and may be run more than once on the same page; confirm before switching to let/const.
// The radix is passed explicitly so input such as "0x10" is not read as hexadecimal.
var depth = Number.parseInt(prompt("Please enter the scraping depth (we recommend between 1 to 3):", "2"), 10);
if (!Number.isInteger(depth) || depth < 1) {
  // Reached for a cancelled prompt (null), empty input and non-numeric text (all parse to NaN),
  // as well as for zero and negative numbers.
  alert("Invalid depth! Setting depth to default (2).");
  depth = 2;
}
// Number of passes accounted for so far; scrollPage() increments it before each extra start() call.
var count = 1;
/**
 * Clicks the toggle associated with every `div[data-lk]` block currently in the page,
 * one block at a time, pausing before each click.
 *
 * Blocks whose surrounding markup does not have the expected shape are skipped, so one
 * unexpected element no longer aborts the whole pass with a TypeError.
 *
 * @param {number} [delayMs=1500] Milliseconds to wait before handling each block.
 * @returns {Promise<void>} Settles once every matched block has been visited.
 */
async function start(delayMs = 1500) {
  // Follows parentElement -> firstChild -> firstChild -> childNodes[1] -> firstChild and
  // returns that node only if it can be clicked; returns null as soon as a step is missing.
  // NOTE(review): this path mirrors the page's current markup — presumably the node is the
  // expand control of a "People Also Ask" entry; verify against the live DOM.
  const findToggle = (element) => {
    let node = element.parentElement;
    node = node && node.firstChild;
    node = node && node.firstChild;
    node = node && node.childNodes && node.childNodes[1];
    node = node && node.firstChild;
    return node && typeof node.click === 'function' ? node : null;
  };

  // Snapshot taken once: blocks added by the clicks below are not visited in this pass.
  const blocks = document.querySelectorAll('div[data-lk]');
  for (const block of blocks) {
    await new Promise((resolve) => setTimeout(resolve, delayMs));
    const toggle = findToggle(block);
    if (toggle) {
      toggle.click();
    }
  }
}
/**
 * Scrolls to the bottom of the page until it stops growing, runs additional start()
 * passes until `count` reaches `depth`, then calls extractData() and removes the
 * loading overlay.
 *
 * Implemented as a loop whose every step is awaited, so the returned promise settles
 * only when the whole run is over and any failure reaches the caller. The overlay is
 * removed in a `finally` block so it never stays on screen after an error.
 *
 * Relies on `count`, `depth`, `start` and `extractData` from the enclosing script.
 *
 * @param {number} [scrollDelayMs=1500] Wait after each scroll before re-measuring the page height.
 * @param {number} [settleDelayMs=5000] Wait after the last pass before extracting.
 * @returns {Promise<void>} Settles after extraction has run and the overlay is gone.
 */
async function scrollPage(scrollDelayMs = 1500, settleDelayMs = 5000) {
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  try {
    for (;;) {
      const lastHeight = document.body.scrollHeight;
      window.scrollTo(0, lastHeight);
      await pause(scrollDelayMs);

      if (lastHeight < document.body.scrollHeight) {
        // The page grew after scrolling: keep scrolling before doing anything else.
        continue;
      }
      if (count >= depth) {
        break;
      }
      count++;
      await start();
    }

    await pause(settleDelayMs);
    extractData();
  } finally {
    // The overlay may already be gone (or never have been added); only remove it if present.
    const overlay = document.getElementById("loading-overlay");
    if (overlay) {
      overlay.remove();
    }
  }
}
/**
 * Collects the question / answer / source-URL triples found under `div[data-lk]`,
 * de-duplicates them by question text (the last occurrence wins) and shows them in a
 * new window as an HTML table with a "Download PAA" link that serves the same rows as CSV.
 *
 * Entries without a URL, or whose URL points at webcache.googleusercontent.com, are dropped.
 * An answer node with data-md="83" is reported as "List", anything else as "Paragraph".
 *
 * Text is stored raw and escaped per output format: HTML-escaped for the table, and quoted
 * with doubled quotes for CSV. Commas are therefore kept; quoted CSV fields may contain them.
 *
 * NOTE(review): questions, answers and links come from three independent queries and are
 * paired by index; if an entry lacks an answer or link the pairing can shift. Verify against
 * the live markup.
 *
 * If the browser blocks the new window, the user is told via alert() and nothing is shown.
 *
 * @returns {void}
 */
function extractData() {
  // Escapes text so scraped content cannot inject markup into the result page.
  const escapeHtml = (text) =>
    String(text)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');

  // Wraps one CSV field in quotes, doubling any embedded quote.
  const toCsvField = (text) => '"' + String(text).replace(/"/g, '""') + '"';

  const questions = document.querySelectorAll('div[data-lk] span.CSkcDe');
  const answers = document.querySelectorAll('div[data-lk] div[data-md="61"], div[data-lk] div[data-md="83"]');
  const urls = document.querySelectorAll('div[data-lk] div.yuRUbf a');

  // A Map keeps first-seen order for every key (a plain object reorders integer-like keys)
  // and is safe for arbitrary scraped text such as "__proto__".
  const rowsByQuestion = new Map();
  for (let i = 0; i < questions.length; i++) {
    const link = urls[i];
    // Everything from '#:~:' onwards (the text-fragment directive) is dropped from the URL.
    const urlText = link ? link.href.split('#:~:')[0] : '';
    if (!urlText || urlText.includes('webcache.googleusercontent.com')) {
      continue;
    }
    const answer = answers[i];
    const answerText = answer ? answer.textContent : '';
    const type = answer && answer.getAttribute('data-md') === '83' ? 'List' : 'Paragraph';
    rowsByQuestion.set(questions[i].textContent, [answerText, urlText, type]);
  }

  let csvContent = 'Question,Answer,URL,Type\n';
  const tableRows = [];
  rowsByQuestion.forEach((row, questionText) => {
    const cells = [questionText].concat(row);
    tableRows.push('<tr>' + cells.map((cell) => '<td>' + escapeHtml(cell) + '</td>').join('') + '</tr>');
    csvContent += cells.map(toCsvField).join(',') + '\n';
  });

  const styleRules = [
    'body { font-family: Arial, sans-serif; background-color: #f3f3f3; margin: 20px; }',
    'h1 { text-align: center; }',
    'p { margin-bottom: 20px; }',
    'table { width: 100%; border-collapse: collapse; background-color: #fff; box-shadow: 0 0 10px rgba(0, 0, 0, 0.1); }',
    'thead th { background-color: #f5f5f5; text-align: left; padding: 8px; }',
    'tbody td { padding: 8px; }',
    'a.button { display: inline-block; margin-top: 20px; margin-bottom: 40px; padding: 10px 15px; background-color: #4caf50; color: #fff; text-decoration: none; border-radius: 4px; cursor: pointer; }',
  ];

  const pageHtml =
    '<!DOCTYPE html><html><head>' +
    '<title>PAA Extractor by Natzir</title>' +
    '<style>' + styleRules.join('') + '</style>' +
    '</head><body>' +
    '<h1>PAA Extractor by Natzir</h1>' +
    '<p>This code scrapes the People Also Ask (PAA) section from Google SERP. <br><br>The "Type" column in the table indicates whether the PAA entry is a list or a paragraph.To provide a cleaner result, duplicate questions are removed from the output.</p>' +
    '<p>Follow me on <a href="https://twitter.com/natzir9">@natzir9</a> for more updates.</p>' +
    '<a href="#" id="download-button" class="button">Download PAA</a>' +
    '<table><thead><tr><th>Question</th><th>Answer</th><th>URL</th><th>Type</th></tr></thead><tbody>' +
    tableRows.join('') +
    '</tbody></table></body></html>';

  const newWindow = window.open("", "_blank");
  if (!newWindow) {
    // window.open returns null when the pop-up is blocked; report it instead of crashing.
    alert('The results window was blocked. Please allow pop-ups for this page and run the script again.');
    return;
  }
  newWindow.document.write(pageHtml);
  // Closing the stream tells the browser the document is complete.
  newWindow.document.close();

  const button = newWindow.document.getElementById("download-button");
  const file = new Blob([csvContent], {type: 'text/csv'});
  button.href = URL.createObjectURL(file);
  button.download = 'PAA.csv';
}
// --- Loading overlay and kick-off -------------------------------------------------
// Builds a full-viewport overlay with a spinner, then launches the scrape.
// NOTE(review): top-level `var` declarations are kept so the snippet can be re-run in the
// same page without redeclaration errors — presumably it is pasted into the console; confirm.

// Keyframes referenced by the spinner's `animation` property below.
var style = document.createElement('style');
style.textContent = '@keyframes spin {0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); }}';
document.head.appendChild(style);

// Semi-transparent layer covering the viewport; scrollPage() looks it up by this id to remove it.
var loadingOverlay = document.createElement("div");
loadingOverlay.id = "loading-overlay";
Object.assign(loadingOverlay.style, {
  position: "fixed",
  top: "0",
  left: "0",
  width: "100%",
  height: "100%",
  background: "rgba(0, 0, 0, 0.5)",
  display: "flex",
  justifyContent: "center",
  alignItems: "center",
  zIndex: "10000",
});

// Ring with a single coloured edge, rotated by the `spin` keyframes.
var spinner = document.createElement("div");
spinner.style.border = "16px solid #f3f3f3";
spinner.style.borderRadius = "50%";
// Must be set after `border`, which would otherwise overwrite the coloured top edge.
spinner.style.borderTop = "16px solid #3498db";
spinner.style.width = "120px";
spinner.style.height = "120px";
spinner.style.animation = "spin 2s linear infinite";

loadingOverlay.appendChild(spinner);
document.body.appendChild(loadingOverlay);

// Run the first expansion pass to completion before scrolling starts, so extraction cannot
// race clicks that are still pending. Any failure is logged and the overlay is cleared
// rather than being left on screen by an unhandled rejection.
start()
  .then(function () {
    return scrollPage();
  })
  .catch(function (error) {
    console.error('PAA extraction failed:', error);
    if (loadingOverlay.parentNode) {
      loadingOverlay.remove();
    }
  });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for sharing it. For those seeking research papers for sale, a convenient option is available at https://studyclerk.com/research-paper-for-sale this site. This website provides an easy process to acquire research papers quickly and efficiently. I have personally found this link useful whenever I need to hire an essay writer. The platform offers a variety of research papers for purchase, making it a valuable resource for students and researchers who require well-crafted papers on specific subjects.