Tamil Scraper
The first script crawls all 40 Naaladiyar pages on sangathamizh.com and collects every verse into a single nested JSON file, naaladiyar_all.json.
import json
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict


def get_naaladiyar_data(page_num):
    """
    Given an integer page_num (1..40), build the URL, scrape, and return:
        section, sub_section, title, results
    where results is a list of:
        {
            "number": verse_no,
            "song": verse_text,
            "meaning": explanation
        }
    """
    # Build page number as 01, 02, ... 40
    page_str = f"{page_num:02d}"  # zero-padded
    url = f"http://www.sangathamizh.com/18keezh-kanakku/naaladiyar/naaladiyar-நாலடியார்{page_str}.html"

    response = requests.get(url)
    response.encoding = "utf-8"  # ensure correct decoding
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract top headings:
    #   <div class="head-back-grnd"><h1>அறத்துப்பால்</h1></div>
    #   <div class="head-title"><h1>துறவறவியல்</h1></div>   (sometimes you may not need this)
    #   <div class="head-title2"><h1>செல்வம் நிலையாமை</h1></div>
    section_div = soup.select_one("div.head-back-grnd h1")
    sub_section_div = soup.select_one("div.head-title h1")
    title_div = soup.select_one("div.head-title2 h1")

    section = section_div.get_text(strip=True) if section_div else ""
    sub_section = sub_section_div.get_text(strip=True) if sub_section_div else ""
    title = title_div.get_text(strip=True) if title_div else ""

    # Extract verses
    verse_blocks = soup.select("div#centerContent")
    results = []

    for vb in verse_blocks:
        sub_header = vb.select_one("div#sub-header")
        if not sub_header:
            continue
        verse_no_text = sub_header.get_text(strip=True)  # e.g. "பாடல் : 001"
        verse_no = verse_no_text.replace("பாடல் : ", "").strip()

        # Verse text
        verse_text_div = vb.select_one("div#p1")
        verse_text = verse_text_div.get_text("\n", strip=True) if verse_text_div else ""

        # Explanation after <h4>பொருளுரை:</h4>
        explanation = ""
        porul_header = vb.find("h4", string="பொருளுரை:")
        if porul_header:
            next_p = porul_header.find_next_sibling("p")
            if next_p:
                explanation = next_p.get_text(strip=True)

        results.append({
            "number": verse_no,
            "song": verse_text,
            "meaning": explanation
        })

    return section, sub_section, title, results
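
# Illustrative quick check (not part of the original gist): fetch one page and
# inspect what the function returns before running the full 40-page loop below.
#   sec, sub, ttl, verses = get_naaladiyar_data(1)
#   print(sec, sub, ttl, len(verses))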

# --------------------------------------------------------------------
# Main logic: Loop from 1..40, gather everything into one JSON.
# We'll store data in an OrderedDict to preserve insertion order.
# Then at the end, we convert to the nested JSON structure.

sections_dict = OrderedDict()  # { section_name: { "name": section_name, "sub-section": OrderedDict(...) } }

for i in range(1, 41):
    print(f"process {i}")
    section, sub_section, title, verse_list = get_naaladiyar_data(i)
    print(f"{section}")

    # If this is the first time we see 'section', create it
    if section not in sections_dict:
        sections_dict[section] = {
            "name": section,
            "sub-section": OrderedDict()
        }

    # For simplicity, we only use the 'title' as sub-section name in the final JSON.
    # But you could also incorporate 'sub_section' if you want a deeper structure.
    subs = sections_dict[section]["sub-section"]
    if title not in subs:
        subs[title] = {
            "name": title,
            "songs": []
        }

    # Append these verses
    subs[title]["songs"].extend(verse_list)

print("END")

# Now build the final JSON structure
json_data = {
    "section": []
}

for sec_data in sections_dict.values():
    # convert the 'sub-section' dict to a list
    sub_section_list = list(sec_data["sub-section"].values())
    sec_data["sub-section"] = sub_section_list
    json_data["section"].append(sec_data)

# Print it
json_output = json.dumps(json_data, ensure_ascii=False, indent=2)
print(json_output)

# Optionally save to a file:
with open("naaladiyar_all.json", "w", encoding="utf-8") as f:
    f.write(json_output)
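
For reference, a minimal sketch (not part of the original gist) that re-loads naaladiyar_all.json and walks the nested structure built above; the key names mirror the code: "section", "name", "sub-section", "songs", "number", "song", "meaning".

# Minimal sketch, assuming naaladiyar_all.json was written by the script above:
# re-load the file and print the first verse to confirm the nested layout.
import json

with open("naaladiyar_all.json", encoding="utf-8") as f:
    data = json.load(f)

first_section = data["section"][0]
first_title = first_section["sub-section"][0]
first_song = first_title["songs"][0]
print(first_section["name"], "/", first_title["name"])
print(first_song["number"])
print(first_song["song"])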
The second script is a standalone, single-page version: it scrapes one page (here Iniyavai Narpathu), extracts the headings and each verse with its explanation, and prints the result.
import requests
from bs4 import BeautifulSoup

# Example URL or local file
url = "http://www.sangathamizh.com/18keezh-kanakku/18keezh-kanakku-iniyavainatpathu-%E0%AE%87%E0%AE%A9%E0%AE%BF%E0%AE%AF%E0%AE%B5%E0%AF%88%E0%AE%A8%E0%AE%BE%E0%AE%B1%E0%AF%8D%E0%AE%AA%E0%AE%A4%E0%AF%81.html"

response = requests.get(url)
# Force response to UTF-8 if needed:
response.encoding = "utf-8"
soup = BeautifulSoup(response.text, "html.parser")

# 1) Extract the three top headings if needed
#    In your HTML, they appear in these classes/IDs:
#      - <div class="head-back-grnd"><h1>அறத்துப்பால்</h1></div>
#      - <div class="head-title"><h1>துறவறவியல்</h1></div>
#      - <div class="head-title2"><h1>செல்வம் நிலையாமை</h1></div>
#
#    Adjust according to whichever text you actually want to treat as
#    "Section", "Sub-section", or "Title".
section_div = soup.select_one("div.head-back-grnd h1")
sub_section_div = soup.select_one("div.head-title h1")
title_div = soup.select_one("div.head-title2 h1")

section = section_div.get_text(strip=True) if section_div else ""
sub_section = sub_section_div.get_text(strip=True) if sub_section_div else ""
title = title_div.get_text(strip=True) if title_div else ""

# Print or store them
print("Section:", section)
print("Sub-section:", sub_section)
print("Title:", title)

# 2) Extract each verse block
#    Verses are inside <div id="centerContent"> blocks.
#    Each has a <div id="sub-header">பாடல் : 001</div>,
#    then <div id="p1"> verse text </div>,
#    then <h4>பொருளுரை:</h4> + <p> explanation </p>.
all_verse_blocks = soup.select("div#centerContent")
results = []

for verse_block in all_verse_blocks:
    # Find the verse number
    sub_header = verse_block.select_one("div#sub-header")
    if not sub_header:
        # This might be a <div id="centerContent"> that doesn't contain a verse;
        # just skip it if it doesn't have a verse number
        continue
    verse_no_text = sub_header.get_text(strip=True)
    # Typically the text is like "பாடல் : 001"; extract the numeric part
    verse_no = verse_no_text.replace("பாடல் : ", "").strip()

    # Verse text is inside <div id="p1">
    verse_text_div = verse_block.select_one("div#p1")
    verse_text = verse_text_div.get_text("\n", strip=True) if verse_text_div else ""

    # The explanation is right after <h4>பொருளுரை:</h4>
    # One way is to locate that <h4> first, then get the next <p>
    explanation = ""
    porul_header = verse_block.find("h4", string="விளக்கம்:")  # change to பொருளுரை for some pages
    if porul_header:
        next_p = porul_header.find_next_sibling("p")
        if next_p:
            explanation = next_p.get_text(strip=True)

    # Store the extracted info
    results.append({
        "பாடல் எண்": verse_no,
        "பாடல்": verse_text,
        "பொருளுரை": explanation
    })

# 3) Print (or do something else with) the extracted data
temp_store = ""
for item in results:
    temp_store = temp_store + f'பாடல்: {item["பாடல் எண்"]}\n{item["பாடல்"]}\nபொருளுரை:\n{item["பொருளுரை"]}\n\n'

print(temp_store)
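
Because the heading before the explanation differs between pages (விளக்கம்: on some, பொருளுரை: on others, as the comment in the loop above notes), a small hedged sketch of a lookup that tries both labels follows; the helper name find_explanation is an assumption, not part of the original gist.

# Hypothetical helper (assumption, not in the original gist): try both heading
# labels used across pages and return the first explanation found.
def find_explanation(verse_block):
    for label in ("விளக்கம்:", "பொருளுரை:"):
        header = verse_block.find("h4", string=label)
        if header:
            next_p = header.find_next_sibling("p")
            if next_p:
                return next_p.get_text(strip=True)
    return ""

Inside the loop, explanation = find_explanation(verse_block) would then replace the label-specific lookup.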