Tamil Scraper
The first script crawls all 40 Naaladiyar pages on sangathamizh.com and collects every verse into a single nested JSON file, naaladiyar_all.json.
import json
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict


def get_naaladiyar_data(page_num):
    """
    Given an integer page_num (1..40), build the URL, scrape, and return:
        section, sub_section, title, results
    where results is a list of:
        {
            "number": verse_no,
            "song": verse_text,
            "meaning": explanation
        }
    """
    # Build page number as 01, 02, ... 40
    page_str = f"{page_num:02d}"  # zero-padded
    url = f"http://www.sangathamizh.com/18keezh-kanakku/naaladiyar/naaladiyar-நாலடியார்{page_str}.html"

    response = requests.get(url)
    response.encoding = "utf-8"  # ensure correct decoding
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract top headings:
    #   <div class="head-back-grnd"><h1>அறத்துப்பால்</h1></div>
    #   <div class="head-title"><h1>துறவறவியல்</h1></div>   (sometimes you may not need this)
    #   <div class="head-title2"><h1>செல்வம் நிலையாமை</h1></div>
    section_div = soup.select_one("div.head-back-grnd h1")
    sub_section_div = soup.select_one("div.head-title h1")
    title_div = soup.select_one("div.head-title2 h1")

    section = section_div.get_text(strip=True) if section_div else ""
    sub_section = sub_section_div.get_text(strip=True) if sub_section_div else ""
    title = title_div.get_text(strip=True) if title_div else ""

    # Extract verses
    verse_blocks = soup.select("div#centerContent")
    results = []

    for vb in verse_blocks:
        sub_header = vb.select_one("div#sub-header")
        if not sub_header:
            continue
        verse_no_text = sub_header.get_text(strip=True)  # e.g. "பாடல் : 001"
        verse_no = verse_no_text.replace("பாடல் : ", "").strip()

        # Verse text
        verse_text_div = vb.select_one("div#p1")
        verse_text = verse_text_div.get_text("\n", strip=True) if verse_text_div else ""

        # Explanation after <h4>பொருளுரை:</h4>
        explanation = ""
        porul_header = vb.find("h4", string="பொருளுரை:")
        if porul_header:
            next_p = porul_header.find_next_sibling("p")
            if next_p:
                explanation = next_p.get_text(strip=True)

        results.append({
            "number": verse_no,
            "song": verse_text,
            "meaning": explanation
        })

    return section, sub_section, title, results
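
# Illustrative quick check (not part of the original gist): fetch one page and
# inspect what the function returns before running the full 40-page loop below.
#   sec, sub, ttl, verses = get_naaladiyar_data(1)
#   print(sec, sub, ttl, len(verses))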

# --------------------------------------------------------------------
# Main logic: Loop from 1..40, gather everything into one JSON.
# We'll store data in an OrderedDict to preserve insertion order.
# Then at the end, we convert to the nested JSON structure.

sections_dict = OrderedDict()  # { section_name: { "name": section_name, "sub-section": OrderedDict(...) } }

for i in range(1, 41):
    print(f"process {i}")
    section, sub_section, title, verse_list = get_naaladiyar_data(i)
    print(f"{section}")

    # If this is the first time we see 'section', create it
    if section not in sections_dict:
        sections_dict[section] = {
            "name": section,
            "sub-section": OrderedDict()
        }

    # For simplicity, we only use the 'title' as sub-section name in the final JSON.
    # But you could also incorporate 'sub_section' if you want a deeper structure.
    subs = sections_dict[section]["sub-section"]
    if title not in subs:
        subs[title] = {
            "name": title,
            "songs": []
        }

    # Append these verses
    subs[title]["songs"].extend(verse_list)

print("END")

# Now build the final JSON structure
json_data = {
    "section": []
}

for sec_data in sections_dict.values():
    # convert the 'sub-section' dict to a list
    sub_section_list = list(sec_data["sub-section"].values())
    sec_data["sub-section"] = sub_section_list
    json_data["section"].append(sec_data)

# Print it
json_output = json.dumps(json_data, ensure_ascii=False, indent=2)
print(json_output)

# Optionally save to a file:
with open("naaladiyar_all.json", "w", encoding="utf-8") as f:
    f.write(json_output)
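
For reference, a minimal sketch (not part of the original gist) that re-loads naaladiyar_all.json and walks the nested structure built above; the key names mirror the code: "section", "name", "sub-section", "songs", "number", "song", "meaning".

# Minimal sketch, assuming naaladiyar_all.json was written by the script above:
# re-load the file and print the first verse to confirm the nested layout.
import json

with open("naaladiyar_all.json", encoding="utf-8") as f:
    data = json.load(f)

first_section = data["section"][0]
first_title = first_section["sub-section"][0]
first_song = first_title["songs"][0]
print(first_section["name"], "/", first_title["name"])
print(first_song["number"])
print(first_song["song"])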
The second script is a standalone, single-page version: it scrapes one page (here Iniyavai Narpathu), extracts the headings and each verse with its explanation, and prints the result.
import requests
from bs4 import BeautifulSoup

# Example URL or local file
url = "http://www.sangathamizh.com/18keezh-kanakku/18keezh-kanakku-iniyavainatpathu-%E0%AE%87%E0%AE%A9%E0%AE%BF%E0%AE%AF%E0%AE%B5%E0%AF%88%E0%AE%A8%E0%AE%BE%E0%AE%B1%E0%AF%8D%E0%AE%AA%E0%AE%A4%E0%AF%81.html"

response = requests.get(url)
# Force response to UTF-8 if needed:
response.encoding = "utf-8"
soup = BeautifulSoup(response.text, "html.parser")

# 1) Extract the three top headings if needed
#    In your HTML, they appear in these classes/IDs:
#      - <div class="head-back-grnd"><h1>அறத்துப்பால்</h1></div>
#      - <div class="head-title"><h1>துறவறவியல்</h1></div>
#      - <div class="head-title2"><h1>செல்வம் நிலையாமை</h1></div>
#
#    Adjust according to whichever text you actually want to treat as
#    "Section", "Sub-section", or "Title".
section_div = soup.select_one("div.head-back-grnd h1")
sub_section_div = soup.select_one("div.head-title h1")
title_div = soup.select_one("div.head-title2 h1")

section = section_div.get_text(strip=True) if section_div else ""
sub_section = sub_section_div.get_text(strip=True) if sub_section_div else ""
title = title_div.get_text(strip=True) if title_div else ""

# Print or store them
print("Section:", section)
print("Sub-section:", sub_section)
print("Title:", title)

# 2) Extract each verse block
#    Verses are inside <div id="centerContent"> blocks.
#    Each has a <div id="sub-header">பாடல் : 001</div>,
#    then <div id="p1"> verse text </div>,
#    then <h4>பொருளுரை:</h4> + <p> explanation </p>.
all_verse_blocks = soup.select("div#centerContent")
results = []

for verse_block in all_verse_blocks:
    # Find the verse number
    sub_header = verse_block.select_one("div#sub-header")
    if not sub_header:
        # This might be a <div id="centerContent"> that doesn't contain a verse;
        # just skip it if it doesn't have a verse number
        continue
    verse_no_text = sub_header.get_text(strip=True)
    # Typically the text is like "பாடல் : 001"; extract the numeric part
    verse_no = verse_no_text.replace("பாடல் : ", "").strip()

    # Verse text is inside <div id="p1">
    verse_text_div = verse_block.select_one("div#p1")
    verse_text = verse_text_div.get_text("\n", strip=True) if verse_text_div else ""

    # The explanation is right after <h4>பொருளுரை:</h4>
    # One way is to locate that <h4> first, then get the next <p>
    explanation = ""
    porul_header = verse_block.find("h4", string="விளக்கம்:")  # change to பொருளுரை for some pages
    if porul_header:
        next_p = porul_header.find_next_sibling("p")
        if next_p:
            explanation = next_p.get_text(strip=True)

    # Store the extracted info
    results.append({
        "பாடல் எண்": verse_no,
        "பாடல்": verse_text,
        "பொருளுரை": explanation
    })

# 3) Print (or do something else with) the extracted data
temp_store = ""
for item in results:
    temp_store = temp_store + f'பாடல்: {item["பாடல் எண்"]}\n{item["பாடல்"]}\nபொருளுரை:\n{item["பொருளுரை"]}\n\n'

print(temp_store)
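
Because the heading before the explanation differs between pages (விளக்கம்: on some, பொருளுரை: on others, as the comment in the loop above notes), a small hedged sketch of a lookup that tries both labels follows; the helper name find_explanation is an assumption, not part of the original gist.

# Hypothetical helper (assumption, not in the original gist): try both heading
# labels used across pages and return the first explanation found.
def find_explanation(verse_block):
    for label in ("விளக்கம்:", "பொருளுரை:"):
        header = verse_block.find("h4", string=label)
        if header:
            next_p = header.find_next_sibling("p")
            if next_p:
                return next_p.get_text(strip=True)
    return ""

Inside the loop, explanation = find_explanation(verse_block) would then replace the label-specific lookup.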