Skip to content

Instantly share code, notes, and snippets.

@russdill
Created January 6, 2016 13:08
Show Gist options
  • Save russdill/19baba11cda88c3003ed to your computer and use it in GitHub Desktop.
Save russdill/19baba11cda88c3003ed to your computer and use it in GitHub Desktop.
LyngSat scraper
#!/usr/bin/env python
from urlparse import urljoin
from bs4 import BeautifulSoup
from xml.etree.ElementTree import Element, SubElement, ElementTree
import urllib2
import os.path
import sys
import guide_data
import re
import fuzzywuzzy.process
def normalize(name):
    """Return *name* rewritten into the canonical form used for matching.

    The rewrites are applied in this order:

    1. every 'Television' becomes 'TV';
    2. every 'USA' becomes 'US';
    3. a trailing East/West/Pacific/Atlantic marker is removed together
       with the spaces and hyphens in front of it;
    4. a DT/HD/TV/CD/SD tag (optionally followed by digits) that is glued
       to the preceding character and ends a word is split off with one
       space, e.g. 'KABCHD' -> 'KABC HD'.  A tag that already follows a
       space or a hyphen is left alone.
    """
    name = name.replace('Television', 'TV')
    name = name.replace('USA', 'US')
    name = re.sub(r'[ -]+(East|West|Pacific|Atlantic)$', '', name)
    # Raw strings keep \d and \g<n> from being read as string escapes.
    name = re.sub(r'([^ -])(DT|HD|TV|CD|SD)(\d*)( |$)',
                  r'\g<1> \g<2>\g<3>\g<4>', name)
    return name
# Each command-line argument names one page to scrape: a 2-letter country
# code, or a 2-letter country plus state such as us-ny.  With no arguments
# only the 'us' page is fetched.
countries = sys.argv[1:]
if not countries:
    countries = ['us']

mapname = 'LyngSat'

print('Opening guide data...')
et = guide_data.guide_data()

# icons:            network name as shown on the page -> absolute logo URL
# icons_normalized: normalize(network name)           -> network name
icons = dict()
icons_normalized = dict()

for country in countries:
    baseurl = 'http://www.lyngsat-logo.com/tvcountry/%s.html' % country
    print('Fetching %s...' % baseurl)
    # For offline testing, urlopen can be pointed at a saved copy of the page
    # through a file:// URL instead.
    response = urllib2.urlopen(baseurl)
    try:
        page = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    print('Processing...')
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; results may differ between machines.
    soup = BeautifulSoup(page)
    for td in soup.find_all('td'):
        a = td.find_all('a')
        # Only cells with exactly two links are used: the first link is
        # expected to wrap the logo image, the second to carry the name.
        if len(a) != 2:
            continue
        try:
            src = a[0].find_all('img')[0]['src']
        except (IndexError, KeyError):
            # First link has no <img>, or the <img> has no src: not a logo
            # cell, skip it.
            continue
        name = a[1].text
        icons[name] = urljoin(baseurl, src)
        icons_normalized[normalize(name)] = name
def match(callsign, network, method):
    """Record a callsign -> network mapping and report it on stdout.

    callsign -- key stored in the module-level ``callsigns`` dict.
    network  -- a key of ``icons_normalized`` (a normalized network name);
                the value stored is the un-normalized name it maps to.
    method   -- short label describing how the match was found; it is
                printed in brackets after the network name.

    Raises KeyError if *network* is not a key of ``icons_normalized``.
    ``callsigns`` must have been created before the first call.
    """
    display_name = icons_normalized[network]
    callsigns[callsign] = display_name
    print('%s [%s]' % (display_name, method))
print('Building chansign mapping...')

# Lower-cased normalized name -> normalized name, for case-insensitive lookup.
lower_keys = dict()
for n in icons_normalized:
    lower_keys[n.lower()] = n

callsigns = dict()

# Quality tags that may be swapped for one another when looking for a match.
_TAGS = ('DT', 'HD', 'TV', 'CD', 'SD')


def _fuzzy(query, choices):
    """Return fuzzywuzzy's best (choice, score) that clears the 95 cutoff, or None."""
    return fuzzywuzzy.process.extractOne(query, choices, score_cutoff=95)


def _ignore_case(key):
    """Return the normalized icon name equal to *key* ignoring case, or None."""
    try:
        return lower_keys[key.lower()]
    except (KeyError, AttributeError):
        # AttributeError: *key* is not a string (for instance a station
        # without a call sign); that is treated as "not found".
        return None


def _find_network(name, callsign, choices):
    """Return (network, method) from the first strategy that hits, else None.

    name     -- normalized station name.
    callsign -- the station's call sign.
    choices  -- list of the keys of ``icons_normalized`` for fuzzy matching.

    Strategies are tried in a fixed order: exact and case-insensitive lookup
    by name, the same by call sign, fuzzy match on the name, fuzzy match
    with the quality tag swapped or appended, and finally fuzzy match on
    abbreviations built from the words of the name.
    """
    if name in icons_normalized:
        return name, 'exact'
    found = _ignore_case(name)
    if found is not None:
        return found, 'case'

    if callsign in icons_normalized:
        return callsign, 'exact(callsign)'
    found = _ignore_case(callsign)
    if found is not None:
        return found, 'case(callsign)'

    result = _fuzzy(name, choices)
    if result:
        return result[0], 'fuzz=%d%%' % result[1]

    # Tag munging.  If the name contains one of the tags (as a substring),
    # try replacing it with each other tag and finally with nothing.
    # Otherwise try appending each tag to the name.
    munge = None
    for tag in _TAGS:
        if tag in name:
            munge = tag
            break
    if munge is None:
        candidates = [(to, name + ' ' + to) for to in _TAGS]
    else:
        targets = [tag for tag in _TAGS if tag != munge] + ['']
        candidates = [(to, name.replace(munge, to)) for to in targets]
    for to, munged_name in candidates:
        result = _fuzzy(munged_name, choices)
        if result:
            method = 'munge %s to %s, fuzz=%d%%' % (
                munge or '<None>', to or '<None>', result[1])
            return result[0], method

    # Abbreviations.  A trailing quality tag (TV excepted) is dropped first.
    # The name can be empty here, e.g. when it consisted only of a
    # parenthesised remark, so guard the index.
    tokens = name.split()
    if tokens and tokens[-1] in ('DT', 'HD', 'CD', 'SD'):
        del tokens[-1]

    if len(tokens) > 2:
        abbrev = ''.join([x[0] for x in tokens])
        result = _fuzzy(abbrev, choices)
        if result:
            return result[0], 'abbrev, fuzz=%d%%' % result[1]

    if len(tokens) > 1 and 'TV' in tokens:
        # Same as above, but the word TV is kept whole.
        abbrev = ''.join([x if x == 'TV' else x[0] for x in tokens])
        result = _fuzzy(abbrev, choices)
        if result:
            return result[0], 'abbrev w/TV, fuzz=%d%%' % result[1]

    return None


# Built once: every fuzzy lookup searches the same set of names.
choices = list(icons_normalized)

for station in et.stations.values():
    text = '%s/%s...' % (station.callSign, station.name)
    # Python 2 print statement: the trailing comma keeps the cursor on this
    # line so the outcome printed below lands next to the station.
    print('%-60s' % text),

    # Strip off anything in parens.  The pattern is greedy: everything from
    # the first '(' to the last ')' is removed.
    name = normalize(re.sub(r'\(.*\)', '', station.name).strip())

    found = _find_network(name, station.callSign, choices)
    if found is None:
        print('no match')
    else:
        match(station.callSign, found[0], found[1])
# Factor the part shared by all icon URLs out into a single <baseurl> entry;
# each <networktourl> then stores '[<mapname>]' followed by the remainder.
prefix = os.path.commonprefix(list(icons.values()))
# commonprefix compares character by character and can stop in the middle of
# a path component, so cut back to the last '/' before dropping that slash.
prefix = prefix[:prefix.rfind('/') + 1].rstrip('/')
strip_len = len(prefix)
base = '[' + mapname + ']'

outfile = 'iconmap-%s.xml' % mapname
print('Saving %s, %d icon mappings found' % (outfile, len(icons)))

top = Element('iconmappings')

# One <callsigntonetwork> element per matched station.
for callsign, network in callsigns.items():
    c1 = SubElement(top, 'callsigntonetwork')
    SubElement(c1, 'callsign').text = callsign
    SubElement(c1, 'network').text = network

# One <networktourl> element per scraped icon, matched or not.
for network, url in icons.items():
    c1 = SubElement(top, 'networktourl')
    SubElement(c1, 'network').text = network
    SubElement(c1, 'url').text = base + url[strip_len:]

# The stub named in the URLs above, and the prefix it stands for.
c1 = SubElement(top, 'baseurl')
SubElement(c1, 'stub').text = mapname
SubElement(c1, 'url').text = prefix

ElementTree(top).write(outfile, encoding='utf-8', xml_declaration=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment