import json

import boto3
from langchain.vectorstores import Chroma, FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.base import Embeddings
textract = boto3.client("textract")

s3_bucket = "your-s3-bucket"
pdf_files = [
    "court1.pdf",
    "court2.pdf",
    # add more here
]
def extract_text(bucket, key):
    # detect_document_text is the Textract API for plain text extraction.
    # (analyze_document is for tables/forms and does not accept a
    # TEXT_DETECTION feature type, so the original call would be rejected.)
    # Note: this synchronous call only handles single-page documents.
    response = textract.detect_document_text(
        Document={"S3Object": {"Bucket": bucket, "Name": key}}
    )
    lines = [block["Text"] for block in response["Blocks"] if block["BlockType"] == "LINE"]
    return "\n".join(lines)
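# Multi-page court PDFs need the asynchronous Textract API instead.
# A minimal sketch, assuming results fit in one response (a production
# version would also follow NextToken pagination in
# get_document_text_detection):
import time

def extract_text_async(bucket, key):
    job = textract.start_document_text_detection(
        DocumentLocation={"S3Object": {"Bucket": bucket, "Name": key}}
    )
    while True:
        result = textract.get_document_text_detection(JobId=job["JobId"])
        if result["JobStatus"] in ("SUCCEEDED", "FAILED"):
            break
        time.sleep(2)  # poll until the job finishes
    if result["JobStatus"] == "FAILED":
        raise RuntimeError(f"Textract job failed for {key}")
    return "\n".join(
        b["Text"] for b in result["Blocks"] if b["BlockType"] == "LINE"
    )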
with open("combined.txt", "w", encoding="utf-8") as f:
    for file in pdf_files:
        text = extract_text(s3_bucket, file)
        f.write(f"\n--- {file} START ---\n")
        f.write(text)
        f.write(f"\n--- {file} END ---\n\n")
# 1. Load and split text
with open("combined.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = splitter.create_documents([raw_text])
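# Optional: instead of one combined file, each chunk can carry per-file
# metadata so answers can cite their source (a sketch; create_documents
# accepts a metadatas list parallel to the texts):
# per_file_texts = [extract_text(s3_bucket, f) for f in pdf_files]
# docs = splitter.create_documents(
#     per_file_texts, metadatas=[{"source": f} for f in pdf_files]
# )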
# 2. Embed using Bedrock Titan via boto3
bedrock = boto3.client("bedrock-runtime")

class BedrockBoto3Embeddings(Embeddings):
    def _embed(self, text):
        response = bedrock.invoke_model(
            modelId="amazon.titan-embed-text-v1",
            contentType="application/json",
            accept="application/json",
            body=json.dumps({"inputText": text}),
        )
        return json.loads(response["body"].read())["embedding"]

    def embed_documents(self, texts):
        return [self._embed(text) for text in texts]

    def embed_query(self, text):
        # Required by the LangChain Embeddings interface; without it this
        # class cannot be instantiated.
        return self._embed(text)
# 3. Create the vector store (FAISS shown as a commented-out alternative)
embedder = BedrockBoto3Embeddings()
# vectorstore = FAISS.from_documents(docs, embedder)
vectorstore = Chroma.from_documents(docs, embedder, persist_directory="./chroma_store")
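# The persisted store can be reloaded later without re-embedding
# (a sketch; the directory must match persist_directory above):
# vectorstore = Chroma(
#     persist_directory="./chroma_store",
#     embedding_function=embedder,
# )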
def answer_question(query):
    # 1. Embed the question (reuses the Titan embedder defined above)
    query_vector = embedder.embed_query(query)

    # 2. Search top-k chunks
    # (similarity_search_by_vector returns Documents, not (doc, score) pairs)
    top_docs = vectorstore.similarity_search_by_vector(query_vector, k=5)
    context = "\n\n".join(doc.page_content for doc in top_docs)

    # 3. Generate answer with Claude
    # Bedrock's claude-v2 expects the "\n\nHuman: ... \n\nAssistant:" prompt format.
    prompt = f"""\n\nHuman: You are a legal assistant. Based on the following court document excerpts, answer the question.

Context:
{context}

Question: {query}

Assistant:"""
    response = bedrock.invoke_model(
        modelId="anthropic.claude-v2",
        contentType="application/json",
        accept="application/json",
        body=json.dumps({
            "prompt": prompt,
            "max_tokens_to_sample": 500,
            "temperature": 0.3,
            "top_k": 250,
            "top_p": 0.9,
        }),
    )
    result = json.loads(response["body"].read())
    return result["completion"]
# Example
print(answer_question("What was the court's decision in court1.pdf?"))
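# A minimal interactive loop sketch, once the index above is built:
# while True:
#     q = input("Question (blank to quit): ").strip()
#     if not q:
#         break
#     print(answer_question(q))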