Created
July 7, 2024 17:56
-
-
Save suryakun/99b9f278daa4e594ef064dc6b8bf9be0 to your computer and use it in GitHub Desktop.
Sample of importing PDF data with LangChain
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import 'dotenv/config' | |
import 'pdf-parse' | |
import { z } from 'zod' | |
import { MemoryVectorStore } from 'langchain/vectorstores/memory'; | |
import { createStuffDocumentsChain } from 'langchain/chains/combine_documents'; | |
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; | |
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'; | |
import { createRetrievalChain } from 'langchain/chains/retrieval'; | |
import { ChatPromptTemplate } from "@langchain/core/prompts"; | |
import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; | |
import { StructuredOutputParser } from 'langchain/output_parsers'; | |
import { GoogleGenerativeAIEmbeddings } from '@langchain/google-genai'; | |
// Connection settings for a local Ollama server. Nothing in the visible
// script reads them (the pipeline is built on Gemini); they are kept as-is
// in case other code relies on them.
const ollamaBaseUrl = "http://localhost:11435";
const ollamaModel = "llama3";

// Startup debug output: prints the GOOGLE_APPLICATION_CREDENTIALS environment
// value (or `undefined` when it is not set).
console.log(process.env.GOOGLE_APPLICATION_CREDENTIALS);
/**
 * Extracts structured applicant data from a CV stored as a PDF.
 *
 * Steps:
 *   1. Load './profile_linkedin.pdf' and split it into overlapping chunks.
 *   2. Embed the chunks into an in-memory vector store and expose a retriever.
 *   3. Build a retrieval chain whose answer is parsed against a zod schema
 *      (name, phone, list of experiences).
 *   4. Print the parsed answer as a JSON string on stdout.
 *
 * @returns {Promise<void>} Resolves once the answer has been printed.
 * @throws Rejects with whatever error the loader, embeddings, model call or
 *   output parser raises; the caller is responsible for handling it.
 */
async function main() {
  // Use GeminiPro
  const llm = new ChatGoogleGenerativeAI({
    model: 'gemini-pro',
    maxOutputTokens: 2048,
  });

  const textSplitter = new RecursiveCharacterTextSplitter({
    chunkSize: 1000,
    chunkOverlap: 200,
  });

  const loader = new PDFLoader('./profile_linkedin.pdf');
  const docs = await loader.load();
  const splits = await textSplitter.splitDocuments(docs);

  const vectorStore = await MemoryVectorStore.fromDocuments(
    splits,
    new GoogleGenerativeAIEmbeddings(),
  );
  const retriever = vectorStore.asRetriever();

  const systemTemplate = 'You are the HR recruitment officer, you need to collect the data from applicant CV';

  // Optional string field that is normalised to '' when the model omits it.
  const optionalText = (description) =>
    z.string().describe(description).optional().transform((val) => val ?? '');

  const schema = z.object({
    name: optionalText('name of applicant'),
    phone: optionalText('phone number of applicant'),
    experiences: z.array(z.object({
      company: optionalText('company name'),
      time: optionalText('work time year start and year end'),
      title: optionalText('applicant job title'),
      // Unlike the other fields this one also accepts null and is not
      // normalised, so it may come back as undefined, null or a string.
      jobDescription: z.string()
        .describe('detail of the job, just put empty string if you have no answer')
        .optional()
        .nullable()
        .or(z.literal('')),
    }))
      .describe('Detail of applicant experience')
      // NOTE(review): the array itself is not optional, so this fallback looks
      // like a no-op as written; kept to preserve the original schema.
      .transform((arr) => arr ?? []),
  });

  const parser = StructuredOutputParser.fromZodSchema(schema);

  const prompt = ChatPromptTemplate.fromMessages([
    ['system', systemTemplate],
    ['human', '{input} {context} {format_instructions}'],
  ]);

  const questionAnswerChain = await createStuffDocumentsChain({ llm, prompt, outputParser: parser });
  const ragChain = await createRetrievalChain({
    retriever,
    combineDocsChain: questionAnswerChain,
  });

  // format_instructions is supplied at invoke time to fill the matching
  // placeholder in the human message of the prompt.
  const results = await ragChain.invoke({
    input: 'what the data that you get? make the answer as text without markdown tag',
    format_instructions: parser.getFormatInstructions(),
  });

  console.log(JSON.stringify(results.answer));
}

// Never leave the promise floating: report the failure and signal it through
// the exit code instead of relying on an unhandled rejection.
main().catch((err) => {
  console.error('Failed to extract applicant data from the PDF:', err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment