Skip to content

Instantly share code, notes, and snippets.

@Jamim
Last active May 24, 2018 23:43
Show Gist options
  • Save Jamim/fe70a0652a728a90eb80f5978a862475 to your computer and use it in GitHub Desktop.
Save Jamim/fe70a0652a728a90eb80f5978a862475 to your computer and use it in GitHub Desktop.
Simple helper for iservice.by
from csv import writer
from os import listdir
from os.path import isfile
from sys import argv
import logging
import re
logger = logging.getLogger('iservice')

# Matches file names that END in ".htm" or ".html".  The pattern is anchored
# with \Z because it is applied with ``match``: without the anchor, names
# that merely contain such a suffix ("page.html.bak", "page.htmlx") would
# also be accepted.
HTML_FILENAME_REGEX = re.compile(r'.*\.html?\Z')

# Labels of the report rows to extract, in output column order.  The same
# list is written as the header row of the CSV report.
TITLES = [
    'Имя компьютера',  # computer name
    'Тип ЦП',  # CPU type
    'Системная плата',  # motherboard
    'Дисковый накопитель',  # disk drive
    'Тип ядра ОС',  # OS kernel type
]

# (title, compiled pattern) pairs, one per entry of TITLES.  Each pattern
# matches a single line of the form
#     <TR>...<TD>{title}&nbsp;&nbsp;<TD>[<A ...>]value[</A>]
# and exposes the cell text as the named group "value"; an optional <A>
# wrapper around the value is stripped.  re.MULTILINE makes ``$`` match at
# the end of every line, so the value never runs past its own row.
# The title is passed through re.escape so that a label containing regex
# metacharacters is still matched literally.
TITLES_REGEXPS = [
    (
        title,
        re.compile(
            f'<TR>.*<TD>{re.escape(title)}&nbsp;&nbsp;<TD>'
            r'(?:<A.*?>)?(?P<value>.*?)(?:</A>)?$',
            re.MULTILINE
        )
    ) for title in TITLES
]

# Captures, as the named group "summary", the contents of the <TABLE> that
# immediately follows the first </TABLE> after the <A NAME="summary">
# anchor.  re.DOTALL lets ``.`` span line breaks, since the tables are
# multi-line.
SUMMARY_REGEXP = re.compile(
    r'<A NAME="summary">.*?</TABLE>'
    r'<TABLE>(?P<summary>.*?)</TABLE>',
    re.DOTALL
)
def parse(filename):
    """Extract one CSV row of hardware details from an HTML report file.

    The file is read as cp1251 text (undecodable bytes are dropped).  For
    every ``(title, pattern)`` pair in ``TITLES_REGEXPS`` the summary
    section located by ``SUMMARY_REGEXP`` is searched:

    * for 'Имя компьютера' only the first match is used;
    * for 'Дисковый накопитель' values containing 'USB Device' are skipped;
    * all remaining values for a title are joined with newlines.

    When the summary yields nothing for a title, the first match anywhere
    in the file is used instead, or an empty string if there is none.

    Returns a list of strings, one per entry of ``TITLES_REGEXPS``, in the
    same order.  If the file has no summary section, the ``.group`` call on
    the failed search raises ``AttributeError``.
    """
    with open(filename, encoding='cp1251', errors='ignore') as report:
        html = report.read()

    summary_html = SUMMARY_REGEXP.search(html).group('summary')

    cells = []
    for title, pattern in TITLES_REGEXPS:
        first_only = title == 'Имя компьютера'
        skip_usb = title == 'Дисковый накопитель'

        kept = []
        for hit in pattern.finditer(summary_html):
            text = hit.group('value')
            if not (skip_usb and 'USB Device' in text):
                kept.append(text)
            if first_only:
                break

        if not kept:
            # Nothing usable in the summary: fall back to the whole document.
            fallback = pattern.search(html)
            cells.append(fallback.group('value') if fallback else '')
            continue

        cells.append('\n'.join(kept))

    return cells
def main(files):
    """Parse HTML reports and write the extracted rows to ``report.csv``.

    ``files`` is a list of file names to parse.  When it is empty, every
    regular file in the current directory whose name matches
    ``HTML_FILENAME_REGEX`` is parsed instead.

    The report is written to ``report.csv`` in the current directory,
    encoded as cp1251, with ``TITLES`` as the header row followed by one row
    per successfully parsed file.  A file that fails to parse is logged
    (with its traceback) and skipped; it does not abort the run.
    """
    if not files:
        files = [
            filename for filename in listdir('.')
            if HTML_FILENAME_REGEX.match(filename) and isfile(filename)
        ]

    # newline='' hands line-ending control to the csv module, as its
    # documentation requires.  Without it, platforms that translate "\n"
    # turn the writer's "\r\n" row terminator into "\r\r\n" and rewrite the
    # newlines embedded in multi-value cells.
    with open(
        'report.csv', 'w', encoding='cp1251', newline=''
    ) as output_file:
        csv = writer(output_file)
        csv.writerow(TITLES)
        for filename in files:
            try:
                row = parse(filename)
            except Exception:
                # Not a bare ``except``: KeyboardInterrupt and SystemExit
                # must still be able to stop the run.  logger.exception
                # keeps the same message and adds the failure reason.
                logger.exception('Can not parse the "%s" file', filename)
            else:
                csv.writerow(row)
# Script entry point: forward the command-line arguments (without the
# program name) to main() as the list of files to parse.
if __name__ == '__main__':
    main(files=argv[1:])
@Jamim
Copy link
Author

Jamim commented May 19, 2018

Requirements

Python 3.6 or higher

How to use

python parser.py <filename1> <filename2> …
or
python parser.py (parses all .html and .htm files in the current directory)

Output

report.csv file

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment