Last active
August 17, 2023 21:03
-
-
Save GuyMicciche/25328acd1551d6a93bc95c43c9cb0ce2 to your computer and use it in GitHub Desktop.
Extracts specific content from HTML based on a dictionary input, and then processes the data into a structured output.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Description:
This code implements a specialized HTML parser using the HTMLParser class
from Python's built-in html.parser module. Its primary function is to
extract specific content from HTML based on user-defined tags and
attributes. The code is tailored to capture content from <h2> tags,
<p> tags with the class 'themeScrp', and <div> tags whose 'data-date'
attribute contains today's date (alternatively, <div> tags with the
class 'tabContent').
Once parsed, the extracted data is then processed to produce a concise
result containing the date, the full scripture text, a scripture reference,
and the daily text. This parser provides a flexible and efficient solution
for extracting relevant information from structured HTML content.
Author:
Guy Micciche
"""
from html.parser import HTMLParser | |
import re | |
from collections.abc import Mapping | |
from datetime import datetime | |
# The HTML always contains three daily texts: yesterday (0), today (1)
# and tomorrow (2). Index 1 selects today's entry.
day = 1

# Today's date rendered as YYYY-MM-DD. datetime.now() is called without a
# timezone, so "today" follows the local clock of the machine running this.
current_date = datetime.now()
formatted_date = format(current_date, '%Y-%m-%d')
class MyHTMLParser(HTMLParser):
    """Collect the text content of selected HTML elements.

    ``tags_to_search`` maps a tag name to its match criteria:

    * a mapping of ``attribute name -> required substring`` selects only
      elements whose attributes contain every listed substring (an empty
      mapping therefore matches every element with that tag name);
    * any other non-``None`` value matches every element with that tag name;
    * ``None`` disables capturing for that tag.

    After ``feed()``, ``contents[tag]`` holds one whitespace-stripped text
    string per captured element, in document order. Text is attributed to
    every tag that is being captured at the time, so an ``<h2>`` inside a
    captured ``<div>`` contributes to both lists.

    Capturing relies on explicit closing tags; an element whose end tag is
    omitted in the markup is never finalized.
    """

    def __init__(self, tags_to_search):
        super().__init__()
        self.tags_to_search = tags_to_search
        # Whether an element of each tag is currently being captured.
        self.recording = {tag: False for tag in tags_to_search}
        # Text fragments gathered so far for the element being captured.
        self.current_data = {}
        # Open-element depth per tag while capturing (1 = outermost only).
        self.nested_count = {}
        # Finished captures, one string per matched element.
        self.contents = {tag: [] for tag in tags_to_search}

    def handle_starttag(self, tag, attrs):
        """Begin capturing a matching element, or track a nested one."""
        if self.recording.get(tag):
            # A same-named element opened inside the one being captured.
            # Count it so that its closing tag does not end the capture
            # early, and keep the text gathered so far.
            self.nested_count[tag] += 1
            return

        criteria = self.tags_to_search.get(tag)
        if criteria is None:
            return

        if isinstance(criteria, Mapping):
            attr_map = dict(attrs)
            # Valueless attributes (e.g. <p class>) are reported as None;
            # treat them as empty so the substring test cannot raise.
            matches = all(
                wanted in (attr_map.get(name) or "")
                for name, wanted in criteria.items()
            )
            if not matches:
                return

        self.start_recording(tag)

    def handle_endtag(self, tag):
        """Close one nesting level and store the text once the outermost ends."""
        if not self.recording.get(tag):
            return
        self.nested_count[tag] -= 1
        if self.nested_count[tag] == 0:
            content = ''.join(self.current_data[tag]).strip()
            self.contents[tag].append(content)
            self.recording[tag] = False

    def handle_data(self, data):
        """Append a text node to every tag that is currently being captured."""
        for tag, recording in self.recording.items():
            if recording:
                self.current_data[tag].append(data)

    def start_recording(self, tag):
        """Mark ``tag`` as being captured and reset its text buffer."""
        self.recording[tag] = True
        self.current_data[tag] = []
        self.nested_count[tag] = self.nested_count.get(tag, 0) + 1
# Elements to capture, keyed by tag name:
#   h2  -> every <h2> (empty criteria match any element of that tag)
#   p   -> paragraphs whose class attribute contains "themeScrp"
#   div -> the block whose data-date attribute contains today's date
# To pick the block by the `day` index at the top instead, switch the div
# entry to {"class": "tabContent"} and index tabContent_text with `day`
# at the bottom.
tags_to_search = {
    "h2": {},
    "p": {"class": "themeScrp"},
    "div": {"data-date": formatted_date},
}

# Run the whole HTML document from the step input through the parser.
parser = MyHTMLParser(tags_to_search)
html_source = inputData['htmlContent']
parser.feed(html_source)
# YOU CAN OUTPUT JUST THE TAGS, OR FORMAT THEM | |
# UNFORMATTED TAGS | |
""" | |
result = {} | |
for tag in tags_to_search: | |
result[tag] = parser.contents[tag][1] if len(parser.contents[tag]) > 1 else 'Not Found' | |
""" | |
# FORMATTED TAGS | |
h2_text = parser.contents["h2"][day] if len(parser.contents["h2"]) > day else 'Not Found' | |
themeScrp_text = parser.contents["p"][day] if len(parser.contents["p"]) > day else 'Not Found' | |
tabContent_text = parser.contents["div"][0] if len(parser.contents["div"]) > 0 else 'Not Found' | |
# Extracting scripture from themeScrp_text based on pattern | |
scripture_match = re.search(r'—(.*? \d+:\d+)', themeScrp_text) | |
scripture_text = scripture_match.group(1) if scripture_match else 'Not Found' | |
result = { | |
'dateText': h2_text, | |
'scriptureFull': themeScrp_text, | |
'scriptureReference': scripture_text, | |
'dailyText': tabContent_text | |
} | |
return result |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment