itsanishjain · November 27, 2024 13:22
diff --git a/idea-output.txt b/idea-output.txt
 - Idea Name: Cybersecurity for AI Companies
 - Idea Title: High-Level Security for AI Weights
 - Description: This idea focuses on providing cybersecurity solutions for AI companies to protect their valuable digital assets from theft or espionage. The proposed solution is to build high-level security for AI weights, similar to Chainalysis but for AI.
 - Categories: Cybersecurity, Artificial Intelligence
 - Industry: Technology, Security
 - Tool Suggestions to build: AI-based cybersecurity tools
 - MVP Plan: Develop a prototype for AI weight security, test it with a few AI companies, and iterate based on feedback.

 - Idea Name: Moon-Focused Space Startups
 - Idea Title: Moon Tourism, Mining, and Asset Placement
 - Description: This idea explores the opportunities in moon-focused space startups, emphasizing the potential for moon tourism, mining, and asset placement, leveraging SpaceX's falling launch costs.
 - Categories: Space Exploration, Tourism, Mining
 - Industry: Aerospace
 - Tool Suggestions to build: Spacecraft, Mining equipment
 - MVP Plan: Partner with SpaceX or similar companies to test the feasibility of moon tourism or mining.

 - Idea Name: Kid-Friendly Smartphone Alternatives
 - Idea Title: Limited-Feature Phone for Kids
 - Description: This idea proposes a kid-friendly smartphone alternative that caters to parents' desire for device control and kids' need for connectivity, offering a limited-feature phone with basic communication and learning tools.
 - Categories: Technology, Education
 - Industry: Consumer Electronics
 - Tool Suggestions to build: Limited-feature smartphone
 - MVP Plan: Design and develop a prototype, test it with a small group of kids and parents, and iterate based on feedback.

 - Idea Name: Humanoid Military Robots
 ...
 - Categories: Product Design, Consumer Goods
 - Industry: Manufacturing, Retail
 - Tool Suggestions to build: Design software, 3D printers
 - MVP Plan: Design a few products with fun and quirky elements, test them in the market, and iterate based on feedback.
 Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
diff --git a/scrape.py b/scrape.py
 from dotenv import load_dotenv
 import os
 from langchain_community.document_loaders import TextLoader
 from langchain import OpenAI
 from langchain import PromptTemplate

 # Load environment settings via python-dotenv before looking up the key.
 load_dotenv()
 # NOTE(review): 'YourAPIKey' is only a placeholder fallback; when
 # OPENAI_API_KEY is unset the script keeps running with it, so a failure
 # would presumably only surface at the first real API call -- confirm.
 openai_api_key = os.getenv('OPENAI_API_KEY', 'YourAPIKey')

 # Load the text file
 loader = TextLoader("4 $1 Billion Startup Ideas.text")
 pages = loader.load()

 # Combine the pages with a single join (instead of repeated `+=`), then
 # replace the tabs with spaces
 text = "".join(page.page_content for page in pages)
 text = text.replace('\t', ' ')

 llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

 # Report the token count the model computes for the combined text.
 num_tokens = llm.get_num_tokens(text)
 print(f"This book has {num_tokens} tokens in it")

 # Loaders
 from langchain.schema import Document

 # Splitters
 from langchain.text_splitter import RecursiveCharacterTextSplitter

 # Model
 from langchain.chat_models import ChatOpenAI

 # Embedding Support
 from langchain.vectorstores import FAISS
 from langchain.embeddings import OpenAIEmbeddings

 # Summarizer we'll use for Map Reduce
 from langchain.chains.summarize import load_summarize_chain

 # Data Science
 import numpy as np
 from sklearn.cluster import KMeans


 # Split the combined text into large chunks that overlap with each other.
 text_splitter = RecursiveCharacterTextSplitter(
     chunk_size=10000,
     chunk_overlap=3000,
     separators=["\n\n", "\n", "\t"],
 )
 docs = text_splitter.create_documents([text])

 num_documents = len(docs)
 print(f"Now our book is split up into {num_documents} documents")

 # Embed the text of every chunk; the clustering step below works on `vectors`.
 embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
 vectors = embeddings.embed_documents([chunk.page_content for chunk in docs])

 # `vectors` holds the chunk embeddings. Convert it to an array once so the
 # per-centroid distance computation does not re-coerce the list every time.
 embedding_matrix = np.asarray(vectors)

 # Choose the number of clusters; this can be adjusted based on the content.
 # One representative chunk is picked per cluster. KMeans cannot fit more
 # clusters than it has samples, so cap the count at the number of chunks.
 num_clusters = min(4, len(embedding_matrix))

 # Perform K-means clustering
 kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embedding_matrix)

 # For every centroid, find the position of the embedding closest to it
 # (argmin over the Euclidean distances to that centroid).
 closest_indices = [
     int(np.argmin(np.linalg.norm(embedding_matrix - center, axis=1)))
     for center in kmeans.cluster_centers_
 ]

 # Sort so the chunks are processed in document order, and drop duplicates so
 # a chunk that is nearest to two centroids is only selected once.
 selected_indices = sorted(set(closest_indices))

 # Chat model used to summarize each selected chunk.
 llm3 = ChatOpenAI(
     model='gpt-3.5-turbo',
     temperature=0,
     max_tokens=1000,
     openai_api_key=openai_api_key,
 )

 map_prompt = """
 You will be given a single passage of a book. This section will be enclosed in triple backticks (```)
 Your goal is to give a summary of this section so that a reader will have a full understanding of what happened.
 Your response should be at least three paragraphs and fully encompass what was said in the passage.

 ```{text}```
 FULL SUMMARY:
 """
 map_prompt_template = PromptTemplate(input_variables=["text"], template=map_prompt)

 # Summarize chain driven by the prompt above.
 map_chain = load_summarize_chain(
     llm=llm3,
     chain_type="stuff",
     prompt=map_prompt_template,
 )

 # Pick out the chunks chosen by the clustering step.
 selected_docs = [docs[chunk_index] for chunk_index in selected_indices]

 # Summarize each selected chunk on its own, printing a short preview as we go.
 summary_list = []
 for i, (chunk_index, doc) in enumerate(zip(selected_indices, selected_docs)):
     chunk_summary = map_chain.run([doc])
     summary_list.append(chunk_summary)
     print(f"Summary #{i} (chunk #{chunk_index}) - Preview: {chunk_summary[:250]} \n")

 # Join the summaries and wrap them in a single Document for the next chain.
 summaries = Document(page_content="\n".join(summary_list))

 print(f"Your total summary has {llm.get_num_tokens(summaries.page_content)} tokens")

 # Prompt that turns the combined summaries into the structured idea list.
 extract_prompt = """
 You will be given a transcription of a podcast where smart business professionals discuss innovative business ideas, strategies, and niche stories. 
 Your goal is to extract structured information that highlights the core business ideas and plans discussed. Ensure the output is concise and accurate, summarizing key actionable insights.

 The transcription will be enclosed in triple backticks (```).
 Use the following format for your response:

 - Idea Name:
 - Idea Title:
 - Description:
 - Categories:
 - Industry:
 - Tool Suggestions to build:
 - MVP Plan:

 ```{text}```
 STRUCTURED INFORMATION:
 """
 extract_prompt_template = PromptTemplate(input_variables=["text"], template=extract_prompt)

 # Chat model for the final extraction: larger output budget and a request
 # timeout of 120.
 llm4 = ChatOpenAI(
     model='gpt-4',
     temperature=0,
     max_tokens=3000,
     request_timeout=120,
     openai_api_key=openai_api_key,
 )

 # Pass verbose=True here if you want to see the inner workings.
 reduce_chain = load_summarize_chain(
     llm=llm4,
     chain_type="stuff",
     prompt=extract_prompt_template,
 )

 # Run the extraction over the single combined-summaries document.
 output = reduce_chain.run([summaries])
 print(output)

 ### Source langchain-tutorials
	- Idea Name: Cybersecurity for AI Companies
	- Idea Title: High-Level Security for AI Weights
	- Description: This idea focuses on providing cybersecurity solutions for AI companies to protect their valuable digital assets from theft or espionage. The proposed solution is to build high-level security for AI weights, similar to Chainalysis but for AI.
	- Categories: Cybersecurity, Artificial Intelligence
	- Industry: Technology, Security
	- Tool Suggestions to build: AI-based cybersecurity tools
	- MVP Plan: Develop a prototype for AI weight security, test it with a few AI companies, and iterate based on feedback.

	- Idea Name: Moon-Focused Space Startups
	- Idea Title: Moon Tourism, Mining, and Asset Placement
	- Description: This idea explores the opportunities in moon-focused space startups, emphasizing the potential for moon tourism, mining, and asset placement, leveraging SpaceX's falling launch costs.
	- Categories: Space Exploration, Tourism, Mining
	- Industry: Aerospace
	- Tool Suggestions to build: Spacecraft, Mining equipment
	- MVP Plan: Partner with SpaceX or similar companies to test the feasibility of moon tourism or mining.

	- Idea Name: Kid-Friendly Smartphone Alternatives
	- Idea Title: Limited-Feature Phone for Kids
	- Description: This idea proposes a kid-friendly smartphone alternative that caters to parents' desire for device control and kids' need for connectivity, offering a limited-feature phone with basic communication and learning tools.
	- Categories: Technology, Education
	- Industry: Consumer Electronics
	- Tool Suggestions to build: Limited-feature smartphone
	- MVP Plan: Design and develop a prototype, test it with a small group of kids and parents, and iterate based on feedback.

	- Idea Name: Humanoid Military Robots
	...
	- Categories: Product Design, Consumer Goods
	- Industry: Manufacturing, Retail
	- Tool Suggestions to build: Design software, 3D printers
	- MVP Plan: Design a few products with fun and quirky elements, test them in the market, and iterate based on feedback.
	Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
	from dotenv import load_dotenv
	import os
	from langchain_community.document_loaders import TextLoader
	from langchain import OpenAI
	from langchain import PromptTemplate

	# Load environment settings via python-dotenv before looking up the key.
	load_dotenv()
	# NOTE(review): 'YourAPIKey' is only a placeholder fallback; when
	# OPENAI_API_KEY is unset the script keeps running with it, so a failure
	# would presumably only surface at the first real API call -- confirm.
	openai_api_key = os.getenv('OPENAI_API_KEY', 'YourAPIKey')

	# Load the text file
	loader = TextLoader("4 $1 Billion Startup Ideas.text")
	pages = loader.load()

	# Combine the pages with a single join, then replace the tabs with spaces.
	# (The loop that used to do this had lost its body indentation.)
	text = "".join(page.page_content for page in pages)
	text = text.replace('\t', ' ')

	llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

	# Report the token count the model computes for the combined text.
	num_tokens = llm.get_num_tokens(text)
	print(f"This book has {num_tokens} tokens in it")

	# Loaders
	from langchain.schema import Document

	# Splitters
	from langchain.text_splitter import RecursiveCharacterTextSplitter

	# Model
	from langchain.chat_models import ChatOpenAI

	# Embedding Support
	from langchain.vectorstores import FAISS
	from langchain.embeddings import OpenAIEmbeddings

	# Summarizer we'll use for Map Reduce
	from langchain.chains.summarize import load_summarize_chain

	# Data Science
	import numpy as np
	from sklearn.cluster import KMeans


	# Split the combined text into large chunks that overlap with each other.
	text_splitter = RecursiveCharacterTextSplitter(
	    chunk_size=10000,
	    chunk_overlap=3000,
	    separators=["\n\n", "\n", "\t"],
	)
	docs = text_splitter.create_documents([text])

	num_documents = len(docs)
	print(f"Now our book is split up into {num_documents} documents")

	# Embed the text of every chunk; the clustering step below works on `vectors`.
	embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
	vectors = embeddings.embed_documents([chunk.page_content for chunk in docs])

	# `vectors` holds the chunk embeddings. Convert it to an array once so the
	# per-centroid distance computation does not re-coerce the list every time.
	embedding_matrix = np.asarray(vectors)

	# Choose the number of clusters; this can be adjusted based on the content.
	# One representative chunk is picked per cluster. KMeans cannot fit more
	# clusters than it has samples, so cap the count at the number of chunks.
	num_clusters = min(4, len(embedding_matrix))

	# Perform K-means clustering
	kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embedding_matrix)

	# For every centroid, find the position of the embedding closest to it
	# (argmin over the Euclidean distances to that centroid).
	closest_indices = [
	    int(np.argmin(np.linalg.norm(embedding_matrix - center, axis=1)))
	    for center in kmeans.cluster_centers_
	]

	# Sort so the chunks are processed in document order, and drop duplicates so
	# a chunk that is nearest to two centroids is only selected once.
	selected_indices = sorted(set(closest_indices))

	# Chat model used to summarize each selected chunk.
	llm3 = ChatOpenAI(
	    temperature=0,
	    openai_api_key=openai_api_key,
	    max_tokens=1000,
	    model='gpt-3.5-turbo',
	)

	map_prompt = """
	You will be given a single passage of a book. This section will be enclosed in triple backticks (```)
	Your goal is to give a summary of this section so that a reader will have a full understanding of what happened.
	Your response should be at least three paragraphs and fully encompass what was said in the passage.

	```{text}```
	FULL SUMMARY:
	"""
	map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

	# Summarize chain driven by the prompt above.
	map_chain = load_summarize_chain(
	    llm=llm3,
	    chain_type="stuff",
	    prompt=map_prompt_template,
	)

	# Pick out the chunks chosen by the clustering step.
	selected_docs = [docs[doc] for doc in selected_indices]

	# Make an empty list to hold your summaries
	summary_list = []

	# Loop through the selected docs. The loop body is indented again here;
	# it had been flattened to the same level as the `for` line.
	for i, doc in enumerate(selected_docs):

	    # Go get a summary of the chunk
	    chunk_summary = map_chain.run([doc])

	    # Append that summary to your list
	    summary_list.append(chunk_summary)

	    print(f"Summary #{i} (chunk #{selected_indices[i]}) - Preview: {chunk_summary[:250]} \n")

	summaries = "\n".join(summary_list)

	# Convert it back to a document
	summaries = Document(page_content=summaries)

	print(f"Your total summary has {llm.get_num_tokens(summaries.page_content)} tokens")

	# Prompt that turns the combined summaries into the structured idea list.
	extract_prompt = """
	You will be given a transcription of a podcast where smart business professionals discuss innovative business ideas, strategies, and niche stories.
	Your goal is to extract structured information that highlights the core business ideas and plans discussed. Ensure the output is concise and accurate, summarizing key actionable insights.

	The transcription will be enclosed in triple backticks (```).
	Use the following format for your response:

	- Idea Name:
	- Idea Title:
	- Description:
	- Categories:
	- Industry:
	- Tool Suggestions to build:
	- MVP Plan:

	```{text}```
	STRUCTURED INFORMATION:
	"""
	extract_prompt_template = PromptTemplate(input_variables=["text"], template=extract_prompt)

	# Chat model for the final extraction: larger output budget and a request
	# timeout of 120.
	llm4 = ChatOpenAI(
	    model='gpt-4',
	    temperature=0,
	    max_tokens=3000,
	    request_timeout=120,
	    openai_api_key=openai_api_key,
	)

	# Pass verbose=True here if you want to see the inner workings.
	reduce_chain = load_summarize_chain(
	    llm=llm4,
	    chain_type="stuff",
	    prompt=extract_prompt_template,
	)

	# Run the extraction over the single combined-summaries document.
	output = reduce_chain.run([summaries])
	print(output)

	### Source langchain-tutorials