Skip to content

Instantly share code, notes, and snippets.

@RageshAntonyHM
Last active February 14, 2025 08:00
Show Gist options
  • Save RageshAntonyHM/81b2f9f117c9b74dce6d68b710397a95 to your computer and use it in GitHub Desktop.
Save RageshAntonyHM/81b2f9f117c9b74dce6d68b710397a95 to your computer and use it in GitHub Desktop.
Tamil Scraper
import json
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
def _heading_text(soup, selector):
    """Return the stripped text of the first node matching *selector*, or ""."""
    node = soup.select_one(selector)
    return node.get_text(strip=True) if node is not None else ""


def get_naaladiyar_data(page_num, timeout=30):
    """
    Scrape one Naaladiyar page from sangathamizh.com.

    The page number is zero-padded to two digits (01, 02, ... 40) and
    inserted into the page URL, which is then fetched and parsed.

    Args:
        page_num: Integer page number; the driver script iterates 1..40.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run forever.

    Returns:
        A tuple ``(section, sub_section, title, results)``. The first three
        are heading strings taken from ``div.head-back-grnd h1``,
        ``div.head-title h1`` and ``div.head-title2 h1`` (an empty string
        when the heading is absent). ``results`` is a list of dicts:
            {
                "number": verse_no,
                "song": verse_text,
                "meaning": explanation
            }

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status. Raising here prevents an error page
            from being parsed into an empty, silently wrong result.
    """
    # Build page number as 01, 02, ... 40
    page_str = f"{page_num:02d}"  # zero-padded
    url = f"http://www.sangathamizh.com/18keezh-kanakku/naaladiyar/naaladiyar-நாலடியார்{page_str}.html"

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    response.encoding = "utf-8"  # ensure correct decoding
    soup = BeautifulSoup(response.text, "html.parser")

    # Top headings of the page.
    section = _heading_text(soup, "div.head-back-grnd h1")
    sub_section = _heading_text(soup, "div.head-title h1")
    title = _heading_text(soup, "div.head-title2 h1")

    # Each verse lives in its own <div id="centerContent"> block.
    results = []
    for vb in soup.select("div#centerContent"):
        sub_header = vb.select_one("div#sub-header")
        if sub_header is None:
            # A content block without a verse-number header is not a verse.
            continue

        # Header text looks like "பாடல் : 001"; keep only the number part.
        verse_no = sub_header.get_text(strip=True).replace("பாடல் : ", "").strip()

        # Verse text; line breaks inside the verse are preserved as "\n".
        verse_text_div = vb.select_one("div#p1")
        verse_text = (
            verse_text_div.get_text("\n", strip=True)
            if verse_text_div is not None
            else ""
        )

        # The explanation is the first <p> sibling after the explanation <h4>.
        explanation = ""
        porul_header = vb.find("h4", string="பொருளுரை:")
        if porul_header is not None:
            next_p = porul_header.find_next_sibling("p")
            if next_p is not None:
                explanation = next_p.get_text(strip=True)

        results.append({
            "number": verse_no,
            "song": verse_text,
            "meaning": explanation
        })

    return section, sub_section, title, results
# --------------------------------------------------------------------
# Driver: fetch pages 1..40 and merge everything into one JSON document.
# While scraping, sections and their titled groups are held in OrderedDicts
# keyed by name so that first-seen order is kept and repeated names merge.
# Just before serialising, the per-section mappings are flattened to lists.
sections_dict = OrderedDict()  # { section_name: { "name": section_name, "sub-section": OrderedDict(...) } }
for i in range(1, 41):
    print(f"process {i}")
    section, sub_section, title, verse_list = get_naaladiyar_data(i)
    print(f"{section}")

    # Fetch the entry for this section, registering it on first sight.
    section_entry = sections_dict.setdefault(
        section,
        {
            "name": section,
            "sub-section": OrderedDict()
        },
    )

    # Only 'title' is used as the sub-section name in the final JSON;
    # 'sub_section' is returned by the scraper but not stored here.
    title_entry = section_entry["sub-section"].setdefault(
        title,
        {
            "name": title,
            "songs": []
        },
    )

    # Add this page's verses to the group.
    title_entry["songs"].extend(verse_list)
print("END")

# Assemble the final structure: each section's "sub-section" mapping
# becomes a plain list of its values.
json_data = {
    "section": []
}
for sec_data in sections_dict.values():
    sec_data["sub-section"] = list(sec_data["sub-section"].values())
    json_data["section"].append(sec_data)

# Serialise once, then both print and save the same text.
json_output = json.dumps(json_data, ensure_ascii=False, indent=2)
print(json_output)
with open("naaladiyar_all.json", "w", encoding="utf-8") as out_file:
    out_file.write(json_output)
import requests
from bs4 import BeautifulSoup
# Standalone single-page scraper: fetch one page, print its headings, then
# print every verse with its explanation.

# Page to scrape.
url = "http://www.sangathamizh.com/18keezh-kanakku/18keezh-kanakku-iniyavainatpathu-%E0%AE%87%E0%AE%A9%E0%AE%BF%E0%AE%AF%E0%AE%B5%E0%AF%88%E0%AE%A8%E0%AE%BE%E0%AE%B1%E0%AF%8D%E0%AE%AA%E0%AE%A4%E0%AF%81.html"

# A timeout keeps a stalled connection from hanging the script, and the
# status check stops an HTTP error page from being parsed as if it were data.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Force response to UTF-8 so the Tamil text decodes correctly.
response.encoding = "utf-8"
soup = BeautifulSoup(response.text, "html.parser")

# 1) Extract the three top headings.
#    They are looked up by these selectors:
#      - div.head-back-grnd h1  -> section
#      - div.head-title h1      -> sub-section
#      - div.head-title2 h1     -> title
#    A missing heading yields an empty string.
section_div = soup.select_one("div.head-back-grnd h1")
sub_section_div = soup.select_one("div.head-title h1")
title_div = soup.select_one("div.head-title2 h1")
section = section_div.get_text(strip=True) if section_div is not None else ""
sub_section = sub_section_div.get_text(strip=True) if sub_section_div is not None else ""
title = title_div.get_text(strip=True) if title_div is not None else ""

# Print them
print("Section:", section)
print("Sub-section:", sub_section)
print("Title:", title)

# 2) Extract each verse block.
#    Verses are inside <div id="centerContent"> blocks. Each has
#    <div id="sub-header"> with the verse number, <div id="p1"> with the
#    verse text, and an explanation <h4> followed by a <p>.
#
# The explanation heading differs between pages, so the original label is
# tried first and the alternative is used only when the first is absent.
explanation_labels = ("விளக்கம்:", "பொருளுரை:")

all_verse_blocks = soup.select("div#centerContent")
results = []
for verse_block in all_verse_blocks:
    # Find the verse number
    sub_header = verse_block.select_one("div#sub-header")
    if sub_header is None:
        # This <div id="centerContent"> does not contain a verse number,
        # so it is not a verse block; skip it.
        continue
    verse_no_text = sub_header.get_text(strip=True)
    # Typically the text is like "பாடல் : 001"; keep only the numeric part.
    verse_no = verse_no_text.replace("பாடல் : ", "").strip()

    # Verse text is inside <div id="p1">; inner line breaks become "\n".
    verse_text_div = verse_block.select_one("div#p1")
    verse_text = verse_text_div.get_text("\n", strip=True) if verse_text_div is not None else ""

    # Locate the explanation <h4>, then take the next <p> sibling.
    explanation = ""
    porul_header = None
    for label in explanation_labels:
        porul_header = verse_block.find("h4", string=label)
        if porul_header is not None:
            break
    if porul_header is not None:
        next_p = porul_header.find_next_sibling("p")
        if next_p is not None:
            explanation = next_p.get_text(strip=True)

    # Store the extracted info
    results.append({
        "பாடல் எண்": verse_no,
        "பாடல்": verse_text,
        "பொருளுரை": explanation
    })

# 3) Print the extracted data. The text is built with a single join rather
#    than repeated concatenation, which re-copies the growing string each time.
temp_store = "".join(
    f'பாடல்: {item["பாடல் எண்"]}\n{item["பாடல்"]}\nபொருளுரை:\n{item["பொருளுரை"]}\n\n'
    for item in results
)
print(temp_store)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment