Last active
April 10, 2025 02:43
-
-
Save yogeshvar/6f899a4fd055edcba5e628a083c32056 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json

import boto3
import numpy as np
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS, Chroma
textract = boto3.client("textract") | |
s3_bucket = "your-s3-bucket" | |
pdf_files = [ | |
"court1.pdf", | |
"court2.pdf", | |
# add more here | |
] | |
def extract_text(bucket, key): | |
response = textract.analyze_document( | |
Document={'S3Object': {'Bucket': bucket, 'Name': key}}, | |
FeatureTypes=["TEXT_DETECTION"] | |
) | |
lines = [block["Text"] for block in response["Blocks"] if block["BlockType"] == "LINE"] | |
return "\n".join(lines) | |
with open("combined.txt", "w", encoding="utf-8") as f: | |
for file in pdf_files: | |
text = extract_text(s3_bucket, file) | |
f.write(f"\n--- {file} START ---\n") | |
f.write(text) | |
f.write(f"\n--- {file} END ---\n\n") | |
# 1. Load and split text | |
with open("combined.txt", "r", encoding="utf-8") as f: | |
raw_text = f.read() | |
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100) | |
docs = splitter.create_documents([raw_text]) | |
# 2. Embed using Bedrock Titan via boto3 | |
bedrock = boto3.client("bedrock-runtime") | |
class BedrockBoto3Embeddings(Embeddings): | |
def embed_documents(self, texts): | |
embeddings = [] | |
for text in texts: | |
response = bedrock.invoke_model( | |
modelId="amazon.titan-embed-text-v1", | |
contentType="application/json", | |
accept="application/json", | |
body=json.dumps({"inputText": text}) | |
) | |
result = json.loads(response["body"].read()) | |
embeddings.append(result["embedding"]) | |
return embeddings | |
# 3. Create FAISS index | |
embedder = BedrockBoto3Embeddings() | |
# vectorstore = FAISS.from_documents(docs, embedder) | |
vectorstore = Chroma.from_documents(docs, embedder, persist_directory="./chroma_store") | |
def answer_question(query): | |
# 1. Embed the question | |
response = bedrock.invoke_model( | |
modelId="amazon.titan-embed-text-v1", | |
contentType="application/json", | |
accept="application/json", | |
body=json.dumps({"inputText": query}) | |
) | |
query_vector = json.loads(response["body"].read())["embedding"] | |
# 2. Search top-k chunks | |
docs_and_scores = vectorstore.similarity_search_by_vector(query_vector, k=5) | |
context = "\n\n".join(doc.page_content for doc in docs_and_scores) | |
# 3. Generate answer with Claude | |
prompt = f""" | |
You are a legal assistant. Based on the following court document excerpts, answer the question. | |
Context: | |
{context} | |
Question: {query} | |
Answer:""" | |
response = bedrock.invoke_model( | |
modelId="anthropic.claude-v2", | |
contentType="application/json", | |
accept="application/json", | |
body=json.dumps({ | |
"prompt": prompt, | |
"max_tokens_to_sample": 500, | |
"temperature": 0.3, | |
"top_k": 250, | |
"top_p": 0.9 | |
}) | |
) | |
result = json.loads(response["body"].read()) | |
return result["completion"] | |
# Example | |
print(answer_question("What was the court's decision in court1.pdf?")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment