GuyMicciche · August 17, 2023 21:03
diff --git a/Code by Zapier - Main HTMLParser.py b/Code by Zapier - Main HTMLParser.py
 """
 Description:
    This code implements a specialized HTML parser using Python's built-in 
    HTMLParser module. Its primary function is to extract specific content 
    from HTML based on user-defined tags and attributes. The code is tailored 
    to capture content from <h2> tags, <p> tags with the class 'themeScrp', 
    and <div> tags whose 'data-date' attribute contains today's date (YYYY-MM-DD).
    Once parsed, the extracted data is then processed to produce a concise 
    result containing the date, the full scripture text, a scripture reference, 
    and the daily text. This parser provides a flexible and efficient solution 
    for extracting relevant information from structured HTML content.

 Author: 
    Guy Micciche
 """
 from html.parser import HTMLParser
 import re
 from collections.abc import Mapping
 from datetime import datetime

 # The fetched page carries three daily texts in a fixed order:
 # index 0 = yesterday, 1 = today, 2 = tomorrow. This script always wants today.
 day = 1

 # Today's date rendered as YYYY-MM-DD; the <div> filter further down compares
 # it with each element's "data-date" attribute.
 # NOTE(review): datetime.now() is naive local time of whatever machine runs
 # this code - confirm that matches the reader's calendar day.
 current_date = datetime.now()
 formatted_date = f"{current_date:%Y-%m-%d}"

 class MyHTMLParser(HTMLParser):
    """Collect the text found inside selected HTML elements.

    ``tags_to_search`` maps a tag name to its match criteria:

    * a mapping of ``attribute -> substring``: the start tag must carry every
      listed attribute with that substring somewhere in its value (an empty
      mapping therefore matches every tag of that name);
    * any other non-``None`` value: every tag of that name matches;
    * ``None``: the tag is never recorded.

    Each finished capture is whitespace-stripped and appended to
    ``self.contents[tag]`` in document order. Same-name elements nested inside
    a capture are counted, so the capture ends at the end tag that balances
    the start tag which opened it.
    """

    def __init__(self, tags_to_search):
        """Prepare empty capture state for every tag in ``tags_to_search``."""
        super().__init__()
        self.tags_to_search = tags_to_search
        # tag -> True while a matching element of that name is open.
        self.recording = {tag: False for tag in tags_to_search}
        # tag -> text fragments gathered for the capture in progress.
        self.current_data = {}
        # tag -> number of open same-name elements in the current capture.
        self.nested_count = {}
        # tag -> list of completed, stripped captures.
        self.contents = {tag: [] for tag in tags_to_search}

    def handle_starttag(self, tag, attrs):
        """Begin a capture when ``tag`` matches, or track nesting inside one."""
        criteria = self.tags_to_search.get(tag)
        if criteria is None:
            return

        if self.recording.get(tag):
            # A same-name element opened inside an active capture: count it so
            # its end tag does not close the capture early, and keep the text
            # gathered so far instead of starting over.
            self.nested_count[tag] += 1
            return

        if isinstance(criteria, Mapping):
            attr_map = dict(attrs)
            for name, wanted in criteria.items():
                # Valueless attributes (e.g. <div hidden>) are reported with a
                # value of None; treat them as empty so the substring test
                # cannot raise TypeError.
                if wanted not in (attr_map.get(name) or ""):
                    return

        self.start_recording(tag)

    def handle_endtag(self, tag):
        """Close one nesting level; store the capture when the last one closes."""
        if tag in self.recording and self.recording[tag]:
            self.nested_count[tag] -= 1
            if self.nested_count[tag] == 0:
                content = ''.join(self.current_data[tag]).strip()
                self.contents[tag].append(content)
                self.recording[tag] = False

    def handle_data(self, data):
        """Append ``data`` to every capture that is currently open."""
        for tag, recording in self.recording.items():
            if recording:
                self.current_data[tag].append(data)

    def start_recording(self, tag):
        """Mark ``tag`` as being captured and start a fresh text buffer."""
        self.recording[tag] = True
        self.current_data[tag] = []
        self.nested_count[tag] = self.nested_count.get(tag, 0) + 1

 tags_to_search = {
    "h2": {},
    "p": {"class": "themeScrp"},
    "div": {"data-date": formatted_date} # will always be today's date, use "div": {"class": "tabContent"} if you want to get it by the "day" variable at top, adjust the tabContent_text at the bottom to use "day" variable
 }

 parser = MyHTMLParser(tags_to_search)
 parser.feed(inputData['htmlContent'])

 # YOU CAN OUTPUT JUST THE TAGS, OR FORMAT THEM
 # UNFORMATTED TAGS
 """
 result = {}
 for tag in tags_to_search:
    result[tag] = parser.contents[tag][1] if len(parser.contents[tag]) > 1 else 'Not Found'
 """

 # FORMATTED TAGS
 h2_text = parser.contents["h2"][day] if len(parser.contents["h2"]) > day else 'Not Found'
 themeScrp_text = parser.contents["p"][day] if len(parser.contents["p"]) > day else 'Not Found'
 tabContent_text = parser.contents["div"][0] if len(parser.contents["div"]) > 0 else 'Not Found'

 # Extracting scripture from themeScrp_text based on pattern
 scripture_match = re.search(r'—(.*? \d+:\d+)', themeScrp_text)
 scripture_text = scripture_match.group(1) if scripture_match else 'Not Found'

 result = {
    'dateText': h2_text,
    'scriptureFull': themeScrp_text,
    'scriptureReference': scripture_text,
    'dailyText': tabContent_text
 }

 return result
	"""
	Description:
	This code implements a specialized HTML parser using Python's built-in
	HTMLParser module. Its primary function is to extract specific content
	from HTML based on user-defined tags and attributes. The code is tailored
	to capture content from <h2> tags, <p> tags with the class 'themeScrp',
	and <div> tags whose 'data-date' attribute contains today's date (YYYY-MM-DD).
	Once parsed, the extracted data is then processed to produce a concise
	result containing the date, the full scripture text, a scripture reference,
	and the daily text. This parser provides a flexible and efficient solution
	for extracting relevant information from structured HTML content.

	Author:
	Guy Micciche
	"""
	from html.parser import HTMLParser
	import re
	from collections.abc import Mapping
	from datetime import datetime

	# The fetched page carries three daily texts in a fixed order:
	# index 0 = yesterday, 1 = today, 2 = tomorrow. This script always wants today.
	day = 1

	# Today's date rendered as YYYY-MM-DD; the <div> filter further down compares
	# it with each element's "data-date" attribute.
	# NOTE(review): datetime.now() is naive local time of whatever machine runs
	# this code - confirm that matches the reader's calendar day.
	current_date = datetime.now()
	formatted_date = f"{current_date:%Y-%m-%d}"

	class MyHTMLParser(HTMLParser):
	    """Collect the text found inside selected HTML elements.

	    ``tags_to_search`` maps a tag name to its match criteria:

	    * a mapping of ``attribute -> substring``: the start tag must carry every
	      listed attribute with that substring somewhere in its value (an empty
	      mapping therefore matches every tag of that name);
	    * any other non-``None`` value: every tag of that name matches;
	    * ``None``: the tag is never recorded.

	    Each finished capture is whitespace-stripped and appended to
	    ``self.contents[tag]`` in document order. Same-name elements nested inside
	    a capture are counted, so the capture ends at the end tag that balances
	    the start tag which opened it.
	    """

	    def __init__(self, tags_to_search):
	        """Prepare empty capture state for every tag in ``tags_to_search``."""
	        super().__init__()
	        self.tags_to_search = tags_to_search
	        # tag -> True while a matching element of that name is open.
	        self.recording = {tag: False for tag in tags_to_search}
	        # tag -> text fragments gathered for the capture in progress.
	        self.current_data = {}
	        # tag -> number of open same-name elements in the current capture.
	        self.nested_count = {}
	        # tag -> list of completed, stripped captures.
	        self.contents = {tag: [] for tag in tags_to_search}

	    def handle_starttag(self, tag, attrs):
	        """Begin a capture when ``tag`` matches, or track nesting inside one."""
	        criteria = self.tags_to_search.get(tag)
	        if criteria is None:
	            return

	        if self.recording.get(tag):
	            # A same-name element opened inside an active capture: count it so
	            # its end tag does not close the capture early, and keep the text
	            # gathered so far instead of starting over.
	            self.nested_count[tag] += 1
	            return

	        if isinstance(criteria, Mapping):
	            attr_map = dict(attrs)
	            for name, wanted in criteria.items():
	                # Valueless attributes (e.g. <div hidden>) are reported with a
	                # value of None; treat them as empty so the substring test
	                # cannot raise TypeError.
	                if wanted not in (attr_map.get(name) or ""):
	                    return

	        self.start_recording(tag)

	    def handle_endtag(self, tag):
	        """Close one nesting level; store the capture when the last one closes."""
	        if tag in self.recording and self.recording[tag]:
	            self.nested_count[tag] -= 1
	            if self.nested_count[tag] == 0:
	                content = ''.join(self.current_data[tag]).strip()
	                self.contents[tag].append(content)
	                self.recording[tag] = False

	    def handle_data(self, data):
	        """Append ``data`` to every capture that is currently open."""
	        for tag, recording in self.recording.items():
	            if recording:
	                self.current_data[tag].append(data)

	    def start_recording(self, tag):
	        """Mark ``tag`` as being captured and start a fresh text buffer."""
	        self.recording[tag] = True
	        self.current_data[tag] = []
	        self.nested_count[tag] = self.nested_count.get(tag, 0) + 1

	tags_to_search = {
	"h2": {},
	"p": {"class": "themeScrp"},
	"div": {"data-date": formatted_date} # will always be today's date, use "div": {"class": "tabContent"} if you want to get it by the "day" variable at top, adjust the tabContent_text at the bottom to use "day" variable
	}

	parser = MyHTMLParser(tags_to_search)
	parser.feed(inputData['htmlContent'])

	# YOU CAN OUTPUT JUST THE TAGS, OR FORMAT THEM
	# UNFORMATTED TAGS
	"""
	result = {}
	for tag in tags_to_search:
	result[tag] = parser.contents[tag][1] if len(parser.contents[tag]) > 1 else 'Not Found'
	"""

	# FORMATTED TAGS
	h2_text = parser.contents["h2"][day] if len(parser.contents["h2"]) > day else 'Not Found'
	themeScrp_text = parser.contents["p"][day] if len(parser.contents["p"]) > day else 'Not Found'
	tabContent_text = parser.contents["div"][0] if len(parser.contents["div"]) > 0 else 'Not Found'

	# Extracting scripture from themeScrp_text based on pattern
	scripture_match = re.search(r'—(.*? \d+:\d+)', themeScrp_text)
	scripture_text = scripture_match.group(1) if scripture_match else 'Not Found'

	result = {
	'dateText': h2_text,
	'scriptureFull': themeScrp_text,
	'scriptureReference': scripture_text,
	'dailyText': tabContent_text
	}

	return result