Created
June 5, 2025 14:55
-
-
Save CharlyWargnier/d600a8fce4839a858d2b08404cbc534e to your computer and use it in GitHub Desktop.
Hacker News Auto Digest - powered by Stagehand!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { Stagehand } from "@browserbasehq/stagehand"; | |
import fs from "fs"; | |
import path from "path"; | |
import Tesseract from "tesseract.js"; | |
import { OpenAI } from "openai"; | |
import dotenv from "dotenv"; | |
// Load variables from a local .env file into process.env before the key below is read.
dotenv.config();

// Single OpenAI client shared by the script, authenticated with OPENAI_API_KEY.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
/**
 * Runs Tesseract OCR over an image file using the "eng" language data.
 *
 * Every progress message emitted by Tesseract has its `status` echoed to the
 * console while recognition is running.
 *
 * @param imagePath - Path of the image to recognise.
 * @returns The text Tesseract extracted from the image.
 */
async function extractTextFromImage(imagePath: string) {
  console.log(`π Running OCR on: ${imagePath}`);

  const { data } = await Tesseract.recognize(imagePath, "eng", {
    logger: (progress) => console.log(progress.status),
  });

  return data.text;
}
/**
 * Asks the OpenAI chat completions API to summarise the beginning of an OCR dump.
 *
 * Only the first 300 whitespace-separated words of `text` are sent. The request
 * uses the "gpt-4o-mini" model (the log line below still says "GPT-4").
 *
 * @param text - Raw OCR text; leading and trailing whitespace is ignored.
 * @returns The summary text, or an empty string when the response carries no
 *          choice or no message content.
 */
async function summarizeWithGPT(text: string): Promise<string> {
  console.log("π‘ Sending first 300 words of OCR text to GPT-4 for summarization...");

  // Trim first: splitting text that starts with whitespace would otherwise yield
  // an empty leading token that uses up one of the 300 word slots.
  const first300Words = text.trim().split(/\s+/).slice(0, 300).join(" ");

  const response = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    messages: [
      {
        role: "system",
        content: "You summarize webpage content based on OCR extracted text.",
      },
      {
        role: "user",
        content: `Please summarize this text:\n\n${first300Words}`,
      },
    ],
    temperature: 0.7,
  });

  // Guard against an empty `choices` array instead of throwing a TypeError.
  return response.choices[0]?.message.content ?? "";
}
/**
 * Runs the whole digest pipeline in a local, non-headless Stagehand browser.
 *
 * Steps:
 *  1. Delete a `Texts` folder in the current working directory if one exists.
 *  2. Ensure `Files/{Screenshots,Texts,Summaries,Summaries/Texts}` exist.
 *  3. Open Hacker News and, for each index in `targetIndices`, follow the
 *     matching `.titleline > a` link, take a full-page screenshot, OCR it,
 *     summarise the OCR text, write both to disk, then navigate back.
 *  4. Merge every `.txt` file in `Files/Summaries/Texts` into
 *     `summaries.md`, `summaries.json` and `summaries.csv`.
 *
 * Errors are logged rather than re-thrown, and the browser is always closed.
 */
async function main() {
  console.log("π Starting Stagehand...");
  const stagehand = new Stagehand({
    env: "LOCAL",
    headless: false,
  });

  // RFC 4180 quoting: wrap the value in double quotes and double any embedded
  // quote, so the summary text survives the CSV round trip unchanged.
  const toCsvField = (value: string): string => `"${value.replace(/"/g, '""')}"`;

  // Plain timer, usable even when no browser page is available.
  const pause = (ms: number): Promise<void> =>
    new Promise<void>((resolve) => setTimeout(resolve, ms));

  try {
    await stagehand.init();
    console.log("β Stagehand initialized");

    // Remove an accidental root-level 'Texts' folder if it exists.
    const rogueTextsFolder = path.join(process.cwd(), "Texts");
    if (fs.existsSync(rogueTextsFolder)) {
      fs.rmSync(rogueTextsFolder, { recursive: true, force: true });
      console.log("ποΈ Removed rogue 'Texts' folder from root directory.");
    }

    // Folder structure
    const baseFolder = path.join(process.cwd(), "Files");
    const screenshotsFolder = path.join(baseFolder, "Screenshots");
    const textsFolder = path.join(baseFolder, "Texts");
    const summariesFolder = path.join(baseFolder, "Summaries");
    const summariesTextsFolder = path.join(summariesFolder, "Texts");

    const folders = [baseFolder, screenshotsFolder, textsFolder, summariesFolder, summariesTextsFolder];
    for (const folder of folders) {
      if (!fs.existsSync(folder)) {
        // `recursive` tolerates a directory that appears between the check and the create.
        fs.mkdirSync(folder, { recursive: true });
        console.log(`π Created folder: ${folder}`);
      }
    }

    console.log("π Navigating to Hacker News...");
    await stagehand.page.goto("https://news.ycombinator.com/");
    await stagehand.page.waitForLoadState("domcontentloaded");

    // Zero-based positions passed to `.nth()`; logs and file names use index + 1.
    const targetIndices = [1, 2];
    for (const index of targetIndices) {
      console.log(`π Locating article link #${index + 1}...`);
      const link = stagehand.page.locator(".titleline > a").nth(index);
      await link.waitFor({ timeout: 5000 });

      console.log(`π Clicking article #${index + 1}...`);
      // Start waiting for the navigation before clicking so it cannot be missed.
      await Promise.all([
        stagehand.page.waitForNavigation(),
        link.click(),
      ]);

      const title = await stagehand.page.title();
      console.log(`π Page title: "${title}"`);

      const screenshotName = `hackernews_link_${index + 1}.png`;
      const screenshotPath = path.join(screenshotsFolder, screenshotName);
      await stagehand.page.screenshot({ path: screenshotPath, fullPage: true });
      console.log(`πΈ Screenshot saved at: ${screenshotPath}`);

      const ocrText = await extractTextFromImage(screenshotPath);
      const textFilePath = path.join(textsFolder, `hackernews_link_${index + 1}.txt`);
      fs.writeFileSync(textFilePath, ocrText);
      console.log(`π OCR text saved to: ${textFilePath}`);

      const summary = await summarizeWithGPT(ocrText);
      const summaryPath = path.join(summariesTextsFolder, `hackernews_link_${index + 1}_summary.txt`);
      fs.writeFileSync(summaryPath, summary);
      console.log(`π§ Summary saved to: ${summaryPath}`);

      console.log("β©οΈ Returning to Hacker News...");
      await stagehand.page.goBack();
      await stagehand.page.waitForLoadState("domcontentloaded");
    }

    // Merge summaries
    console.log("π¦ Merging summaries into MD, JSON, CSV...");
    const summaryFiles = fs.readdirSync(summariesTextsFolder).filter((file) => file.endsWith(".txt"));
    const markdownOutput: string[] = [];
    const jsonOutput: Record<string, string> = {};
    const csvOutput: string[] = ["id,summary"];

    for (const file of summaryFiles) {
      const id = path.basename(file, ".txt");
      const content = fs.readFileSync(path.join(summariesTextsFolder, file), "utf-8").trim();
      markdownOutput.push(`### ${id}\n\n${content}\n`);
      jsonOutput[id] = content;
      csvOutput.push(`${toCsvField(id)},${toCsvField(content)}`);
    }

    fs.writeFileSync(path.join(summariesFolder, "summaries.md"), markdownOutput.join("\n---\n"));
    fs.writeFileSync(path.join(summariesFolder, "summaries.json"), JSON.stringify(jsonOutput, null, 2));
    fs.writeFileSync(path.join(summariesFolder, "summaries.csv"), csvOutput.join("\n"));
    console.log("β summaries.md, .json, .csv created.");

    // Leave the browser window open briefly before it is closed.
    await stagehand.page.waitForTimeout(3000);
  } catch (err) {
    console.error("β Error:", err);
    // Use a plain timer rather than `stagehand.page.waitForTimeout`: if `init()`
    // was what failed there may be no usable page, and a second throw here
    // would mask the error that was just logged.
    await pause(3000);
  } finally {
    await stagehand.close();
    console.log("π Browser closed β automation complete!");
  }
}
// Script entry point: run the pipeline and print anything that escapes main()'s own handling.
main().catch((error) => {
  console.error(error);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment