russdill · January 6, 2016 13:08
diff --git a/lynq-scrape.py b/lynq-scrape.py
 #!/usr/bin/env python

 from urlparse import urljoin
 from bs4 import BeautifulSoup
 from xml.etree.ElementTree import Element, SubElement, ElementTree

 import urllib2
 import os.path
 import sys
 import guide_data
 import re
 import fuzzywuzzy.process

 def normalize(name):
    """Return *name* rewritten into the canonical form used for matching.

    The same rewrite is applied to network names scraped from the logo site
    and to station names from the guide data so the two can be compared:

    * 'Television' is shortened to 'TV' and 'USA' to 'US';
    * a trailing East/West/Pacific/Atlantic marker is removed together with
      the spaces/dashes in front of it;
    * a DT/HD/TV/CD/SD suffix (optionally followed by digits) that is glued
      to the preceding character is split off with a space, e.g.
      'FoxHD' -> 'Fox HD'.
    """
    # Plain literal substitutions; no regex needed.
    name = name.replace('Television', 'TV')
    name = name.replace('USA', 'US')
    # Raw strings keep backslash sequences such as \d and \g intact for the
    # regex engine instead of relying on unknown string escapes passing
    # through unchanged.
    name = re.sub(r'[ -]+(East|West|Pacific|Atlantic)$', '', name)
    name = re.sub(r'([^ -])(DT|HD|TV|CD|SD)(\d*)( |$)', r'\g<1> \g<2>\g<3>\g<4>', name)

    return name

 # 2 letter country code or 2 letter country - state, eg us-ny
 countries = sys.argv[1:]
 if not len(countries):
    countries = ['us']

 mapname = 'LyngSat'

 print 'Opening guide data...'
 et = guide_data.guide_data()

 icons = dict()
 icons_normalized = dict()

 for country in countries:
    baseurl = 'http://www.lyngsat-logo.com/tvcountry/%s.html' % country
    print 'Fetching %s...' % baseurl
    page = urllib2.urlopen(baseurl).read()
    #page = urllib2.urlopen('file:///home/russ/src/atropine/%s.html' % country).read()

    print 'Processing...'
    soup = BeautifulSoup(page)

    for td in soup.find_all('td'):
        a = td.find_all('a')
        if len(a) == 2:
            try:
                name = a[1].text
                icons[name] = urljoin(baseurl, a[0].find_all('img')[0]['src'])
                icons_normalized[normalize(name)] = name
            except:
                pass

 def match(callsign, network, method):
    callsigns[callsign] = icons_normalized[network]
    print '%s [%s]' % (icons_normalized[network], method)

 print 'Building chansign mapping...'
 lower_keys = dict()
 for n in icons_normalized.keys():
    lower_keys[n.lower()] = n

 callsigns = dict()
 for station in et.stations.values():

    text = '%s/%s...' % (station.callSign, station.name)
    print '%-60s' % text,

    # Strip off anything in parens
    name = re.sub('\(.*\)', '', station.name).strip()
    name = normalize(name)

    if name in icons_normalized:
        match(station.callSign, name, 'exact')
        continue

    try:
        match(station.callSign, lower_keys[name.lower()], 'case')
        continue
    except:
        pass

    if station.callSign in icons_normalized:
        match(station.callSign, station.callSign, 'exact(callsign)')
        continue

    try:
        match(station.callSign, lower_keys[station.callSign.lower()], 'case(callsign)')
        continue
    except:
        pass

    result = fuzzywuzzy.process.extractOne(name, icons_normalized.keys(), score_cutoff=95)
    if result:
        match(station.callSign, result[0], 'fuzz=%d%%' % result[1])
        continue

    munges = ['DT', 'HD', 'TV', 'CD', 'SD']
    munge = None
    for m in munges:
        if m in name:
            munge = m
            munges.remove(m)
            munges.append('')
            break

    munged = False
    for to in munges:
        if munge:
            munged_name = name.replace(m, to)
        else:
            munged_name = name + ' ' + to
        result = fuzzywuzzy.process.extractOne(munged_name, icons_normalized.keys(), score_cutoff=95)
        if result:
            if munge is None:
               munge = '<None>'
            if to is '':
               to = '<None>'
            match(station.callSign, result[0], 'munge %s to %s, fuzz=%d%%' % (munge, to, result[1]))
            munged = True
            break
    if munged:
        continue

    tokens = name.split()
    suffix = None
    if tokens[-1] in ('DT', 'HD', 'CD', 'SD'):
        suffix = tokens[-1]
        del tokens[-1]

    if len(tokens) > 2:
        abbrev = ''.join([x[0] for x in tokens])
        result = fuzzywuzzy.process.extractOne(abbrev, icons_normalized.keys(), score_cutoff=95)
        if result:
            match(station.callSign, result[0], 'abbrev, fuzz=%d%%' % result[1])
            continue

    if len(tokens) > 1 and 'TV' in tokens:
        abbrev = ''.join([x if x == 'TV' else x[0] for x in tokens])
        result = fuzzywuzzy.process.extractOne(abbrev, icons_normalized.keys(), score_cutoff=95)
        if result:
            match(station.callSign, result[0], 'abbrev w/TV, fuzz=%d%%' % result[1])
            continue

    print 'no match'

 prefix = os.path.commonprefix(icons.values()).rstrip('/')

 strip_len = len(prefix)
 base = '[' + mapname + ']'

 outfile = 'iconmap-%s.xml' % mapname
 print 'Saving %s, %d icon mappings found' % (outfile, len(icons))
 top = Element('iconmappings')

 for callsign, network in callsigns.iteritems():
    c1 = SubElement(top, 'callsigntonetwork')
    SubElement(c1, 'callsign').text = callsign
    SubElement(c1, 'network').text = network

 for network, url in icons.iteritems():
    c1 = SubElement(top, 'networktourl')
    SubElement(c1, 'network').text = network
    SubElement(c1, 'url').text = base + url[strip_len:]

 c1 = SubElement(top, 'baseurl')
 SubElement(c1, 'stub').text = mapname
 SubElement(c1, 'url').text = prefix

 ElementTree(top).write(outfile, encoding='utf-8', xml_declaration=True)
	#!/usr/bin/env python

	from urlparse import urljoin
	from bs4 import BeautifulSoup
	from xml.etree.ElementTree import Element, SubElement, ElementTree

	import urllib2
	import os.path
	import sys
	import guide_data
	import re
	import fuzzywuzzy.process

	def normalize(name):
	    """Return *name* rewritten into the canonical form used for matching.

	    * 'Television' is shortened to 'TV' and 'USA' to 'US';
	    * a trailing East/West/Pacific/Atlantic marker is removed together
	      with the spaces/dashes in front of it;
	    * a DT/HD/TV/CD/SD suffix (optionally followed by digits) that is
	      glued to the preceding character is split off with a space, e.g.
	      'FoxHD' -> 'Fox HD'.
	    """
	    name = name.replace('Television', 'TV')
	    name = name.replace('USA', 'US')
	    # The alternation bars must be unescaped: '\|' matches a literal pipe
	    # character, which would keep these patterns from ever matching.
	    # Raw strings keep \d and \g intact for the regex engine.
	    name = re.sub(r'[ -]+(East|West|Pacific|Atlantic)$', '', name)
	    name = re.sub(r'([^ -])(DT|HD|TV|CD|SD)(\d*)( |$)', r'\g<1> \g<2>\g<3>\g<4>', name)

	    return name

	# 2 letter country code or 2 letter country - state, eg us-ny
	countries = sys.argv[1:]
	if not len(countries):
	countries = ['us']

	mapname = 'LyngSat'

	print 'Opening guide data...'
	et = guide_data.guide_data()

	icons = dict()
	icons_normalized = dict()

	for country in countries:
	baseurl = 'http://www.lyngsat-logo.com/tvcountry/%s.html' % country
	print 'Fetching %s...' % baseurl
	page = urllib2.urlopen(baseurl).read()
	#page = urllib2.urlopen('file:///home/russ/src/atropine/%s.html' % country).read()

	print 'Processing...'
	soup = BeautifulSoup(page)

	for td in soup.find_all('td'):
	a = td.find_all('a')
	if len(a) == 2:
	try:
	name = a[1].text
	icons[name] = urljoin(baseurl, a[0].find_all('img')[0]['src'])
	icons_normalized[normalize(name)] = name
	except:
	pass

	def match(callsign, network, method):
	callsigns[callsign] = icons_normalized[network]
	print '%s [%s]' % (icons_normalized[network], method)

	print 'Building chansign mapping...'
	lower_keys = dict()
	for n in icons_normalized.keys():
	lower_keys[n.lower()] = n

	callsigns = dict()
	for station in et.stations.values():

	text = '%s/%s...' % (station.callSign, station.name)
	print '%-60s' % text,

	# Strip off anything in parens
	name = re.sub('\(.*\)', '', station.name).strip()
	name = normalize(name)

	if name in icons_normalized:
	match(station.callSign, name, 'exact')
	continue

	try:
	match(station.callSign, lower_keys[name.lower()], 'case')
	continue
	except:
	pass

	if station.callSign in icons_normalized:
	match(station.callSign, station.callSign, 'exact(callsign)')
	continue

	try:
	match(station.callSign, lower_keys[station.callSign.lower()], 'case(callsign)')
	continue
	except:
	pass

	result = fuzzywuzzy.process.extractOne(name, icons_normalized.keys(), score_cutoff=95)
	if result:
	match(station.callSign, result[0], 'fuzz=%d%%' % result[1])
	continue

	munges = ['DT', 'HD', 'TV', 'CD', 'SD']
	munge = None
	for m in munges:
	if m in name:
	munge = m
	munges.remove(m)
	munges.append('')
	break

	munged = False
	for to in munges:
	if munge:
	munged_name = name.replace(m, to)
	else:
	munged_name = name + ' ' + to
	result = fuzzywuzzy.process.extractOne(munged_name, icons_normalized.keys(), score_cutoff=95)
	if result:
	if munge is None:
	munge = '<None>'
	if to is '':
	to = '<None>'
	match(station.callSign, result[0], 'munge %s to %s, fuzz=%d%%' % (munge, to, result[1]))
	munged = True
	break
	if munged:
	continue

	tokens = name.split()
	suffix = None
	if tokens[-1] in ('DT', 'HD', 'CD', 'SD'):
	suffix = tokens[-1]
	del tokens[-1]

	if len(tokens) > 2:
	abbrev = ''.join([x[0] for x in tokens])
	result = fuzzywuzzy.process.extractOne(abbrev, icons_normalized.keys(), score_cutoff=95)
	if result:
	match(station.callSign, result[0], 'abbrev, fuzz=%d%%' % result[1])
	continue

	if len(tokens) > 1 and 'TV' in tokens:
	abbrev = ''.join([x if x == 'TV' else x[0] for x in tokens])
	result = fuzzywuzzy.process.extractOne(abbrev, icons_normalized.keys(), score_cutoff=95)
	if result:
	match(station.callSign, result[0], 'abbrev w/TV, fuzz=%d%%' % result[1])
	continue

	print 'no match'

	prefix = os.path.commonprefix(icons.values()).rstrip('/')

	strip_len = len(prefix)
	base = '[' + mapname + ']'

	outfile = 'iconmap-%s.xml' % mapname
	print 'Saving %s, %d icon mappings found' % (outfile, len(icons))
	top = Element('iconmappings')

	for callsign, network in callsigns.iteritems():
	c1 = SubElement(top, 'callsigntonetwork')
	SubElement(c1, 'callsign').text = callsign
	SubElement(c1, 'network').text = network

	for network, url in icons.iteritems():
	c1 = SubElement(top, 'networktourl')
	SubElement(c1, 'network').text = network
	SubElement(c1, 'url').text = base + url[strip_len:]

	c1 = SubElement(top, 'baseurl')
	SubElement(c1, 'stub').text = mapname
	SubElement(c1, 'url').text = prefix

	ElementTree(top).write(outfile, encoding='utf-8', xml_declaration=True)