Last active
May 24, 2018 23:43
-
-
Save Jamim/fe70a0652a728a90eb80f5978a862475 to your computer and use it in GitHub Desktop.
Simple helper for iservice.by
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from csv import writer | |
from os import listdir | |
from os.path import isfile | |
from sys import argv | |
import logging | |
import re | |
# Module-level logger used to report files that could not be parsed.
logger = logging.getLogger('iservice')

# Matches file names ending in ".htm" or ".html".  The trailing \Z matters:
# the pattern is applied with .match(), which anchors only the start, so
# without it names such as "report.html.bak" would be accepted as well.
HTML_FILENAME_REGEX = re.compile(r'.*\.html?\Z')
# Row titles looked up in each report; they also form the CSV header, so the
# order here is the column order of the output.
TITLES = [
    'Имя компьютера',
    'Тип ЦП',
    'Системная плата',
    'Дисковый накопитель',
    'Тип ядра ОС',
]

# (title, compiled pattern) pairs, in the same order as TITLES.  Each pattern
# matches a single line "<TR>...<TD>{title} <TD>value" and captures the rest
# of the line as "value", dropping an optional surrounding <A ...>...</A>.
# MULTILINE makes "$" match at the end of each line rather than only at the
# end of the text.
TITLES_REGEXPS = [
    (
        title,
        re.compile(
            # Escape the title so it is always matched literally, even if a
            # title containing regex metacharacters is added later.
            f'<TR>.*<TD>{re.escape(title)} <TD>'
            r'(?:<A.*?>)?(?P<value>.*?)(?:</A>)?$',
            re.MULTILINE,
        ),
    )
    for title in TITLES
]
# Finds the "summary" anchor, skips lazily to the next "</TABLE><TABLE>"
# boundary and captures the contents of the table that starts there as
# "summary".  DOTALL lets "." cross line breaks so the match can span lines.
SUMMARY_REGEXP = re.compile(
    r'<A NAME="summary">.*?</TABLE><TABLE>(?P<summary>.*?)</TABLE>',
    flags=re.DOTALL,
)
def parse(filename):
    """Extract one report row from an HTML report file.

    The file is read as cp1251, silently dropping undecodable bytes.  Each
    title in TITLES_REGEXPS is looked up in the summary table first; if it
    is not found there, the first match anywhere in the file is used.

    Args:
        filename: Path of the HTML report to read.

    Returns:
        A list of strings, one per entry of TITLES_REGEXPS and in the same
        order.  Multiple values for one title are joined with newlines; a
        title that is found nowhere yields an empty string.

    Raises:
        ValueError: If the file contains no summary table.
        OSError: If the file cannot be opened or read.
    """
    with open(filename, encoding='cp1251', errors='ignore') as input_file:
        data = input_file.read()

    summary_match = SUMMARY_REGEXP.search(data)
    if summary_match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f'No summary table found in "{filename}"')
    summary = summary_match.group('summary')

    row = []
    for title, regex in TITLES_REGEXPS:
        values = []
        for match in regex.finditer(summary):
            value = match.group('value')
            # Disk drive entries that are USB devices are left out.
            if title == 'Дисковый накопитель' and 'USB Device' in value:
                continue
            values.append(value)
            # Only the first computer name is kept.
            if title == 'Имя компьютера':
                break
        if values:
            row.append('\n'.join(values))
        else:
            # Nothing usable in the summary: fall back to the whole file.
            match = regex.search(data)
            row.append(match.group('value') if match else '')
    return row
def main(files):
    """Parse HTML reports and write the collected rows to report.csv.

    Args:
        files: Paths of the reports to parse.  When empty, every regular
            file in the current directory whose name matches
            HTML_FILENAME_REGEX is parsed instead.

    The output file "report.csv" is created in the current directory,
    encoded as cp1251, with TITLES as its header row.  A file that fails to
    parse is logged (with traceback) and skipped; the remaining files are
    still processed.
    """
    if not files:
        files = [
            filename for filename in listdir('.')
            if HTML_FILENAME_REGEX.match(filename) and isfile(filename)
        ]

    # newline='' is required by the csv module: it writes its own line
    # terminators, and cells here may contain embedded newlines.  Without it
    # Windows output gets "\r\r\n" line endings.
    with open(
        'report.csv', 'w', encoding='cp1251', newline=''
    ) as output_file:
        report = writer(output_file)
        report.writerow(TITLES)
        for filename in files:
            try:
                row = parse(filename)
            except Exception:
                # Exception (not a bare except) so that Ctrl+C and
                # SystemExit still stop the run; logger.exception records
                # the traceback so the cause of the failure is visible.
                logger.exception(f'Can not parse the "{filename}" file')
            else:
                report.writerow(row)
# Script entry point: hand every command-line argument after the script name
# to main() as the list of files to parse.
if __name__ == '__main__':
    cli_files = argv[1:]
    main(cli_files)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Requirements
Python 3.6 or higher
How to use
python parser.py <filename1> <filename2> …
or
python parser.py
(will parse all html and htm files in the current directory)
Output
report.csv file