Last active
February 10, 2020 23:28
-
-
Save pypt/915c0fd5bf4258bee584221bfb5514db to your computer and use it in GitHub Desktop.
Validate new feed parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3.7 | |
import calendar | |
import os | |
import time | |
from mediawords.feed.parse import parse_feed | |
# Directory scanned for ``*.xml`` feed files.
input_dir = '/feeds/'
# Directory that receives one result file per processed feed.
output_dir = '/feed_results_new/'

# Fail early with an explicit error if either directory is missing. A bare
# ``assert`` would be stripped when Python runs with ``-O`` and would not say
# which path is at fault.
for _required_dir in (input_dir, output_dir):
    if not os.path.isdir(_required_dir):
        raise NotADirectoryError(f"Required directory does not exist: {_required_dir}")
def sql_date_to_timestamp(date: str) -> int:
    """Convert a ``YYYY-MM-DD HH:MM:SS`` string to a Unix timestamp.

    The string is parsed with ``time.strptime`` and the resulting struct is
    passed to ``calendar.timegm``, which interprets it as UTC.

    Raises:
        ValueError: if ``date`` does not match the expected format.
    """
    parsed = time.strptime(date, "%Y-%m-%d %H:%M:%S")
    return calendar.timegm(parsed)
def is_non_empty_file(fpath):
    """Return True if ``fpath`` is an existing regular file larger than 0 bytes."""
    if not os.path.isfile(fpath):
        return False
    size_in_bytes = os.path.getsize(fpath)
    return size_in_bytes > 0
# Parse every ``*.xml`` file found in ``input_dir`` and write a single
# tab-separated summary line for it into ``output_dir``. The file is named
# after the feed without its extension. Columns, in order:
#   feed name, parse succeeded (1/0), item count, feed title length,
#   summed item title lengths, summed item description lengths,
#   number of items with a publish date, average item publish timestamp.
# When parsing fails, every column after "parse succeeded" is left empty.
for input_filename in os.listdir(input_dir):
    if not input_filename.endswith('.xml'):
        continue

    input_path = f"{input_dir}/{input_filename}"
    output_filename = os.path.splitext(input_filename)[0]
    output_path = f"{output_dir}/{output_filename}"

    # Skip feeds whose result file already exists and is non-empty.
    if is_non_empty_file(output_path):
        continue

    # Read raw bytes; the context manager closes the handle even on error.
    with open(input_path, 'rb') as feed_file:
        raw_feed = feed_file.read()

    # Some feeds have encoding problems
    feed_contents = raw_feed.decode('utf-8', errors='replace')

    parsed_feed = parse_feed(feed_contents)

    if parsed_feed:
        parse_succeeded = 1

        feed_items = parsed_feed.items()
        item_count = len(feed_items)
        feed_title_length = len(parsed_feed.title() or '')

        total_items_title_length = 0
        total_items_description_length = 0
        total_items_defined_publish_dates = 0
        total_items_timestamp = 0

        for item in feed_items:
            # Missing titles / descriptions count as zero-length.
            total_items_title_length += len(item.title() or '')
            total_items_description_length += len(item.description() or '')

            publish_date = item.publish_date_sql()
            if not publish_date:
                continue

            total_items_defined_publish_dates += 1

            try:
                timestamp = sql_date_to_timestamp(publish_date)
            except Exception as ex:
                # Keep the original exception attached as the cause.
                raise Exception(f"Unable to parse publish date for feed {output_filename}: {ex}") from ex

            print(publish_date + "\t" + str(timestamp))
            total_items_timestamp += timestamp

        if total_items_defined_publish_dates:
            average_item_timestamp = int(total_items_timestamp / total_items_defined_publish_dates)
        else:
            # No dated items: leave the column empty rather than divide by zero.
            average_item_timestamp = ''

    else:
        parse_succeeded = 0
        item_count = ''
        feed_title_length = ''
        total_items_title_length = ''
        total_items_description_length = ''
        total_items_defined_publish_dates = ''
        average_item_timestamp = ''

    result = f"{output_filename}\t{parse_succeeded}\t{item_count}\t{feed_title_length}\t{total_items_title_length}\t{total_items_description_length}\t{total_items_defined_publish_dates}\t{average_item_timestamp}\n"

    print(result)

    # The context manager guarantees the result file is flushed and closed.
    with open(output_path, 'w') as result_file:
        result_file.write(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment