kenzotakahashi · February 21, 2017 05:58
diff --git a/get_xpath.py b/get_xpath.py
 from urllib import request
 from lxml import etree
 import re

 def get_index(e):
 	"""Return the 1-based position of ``e`` among siblings sharing its tag.

 	``None`` is returned when neither a preceding nor a following sibling
 	has the same tag, i.e. when no positional predicate is needed.
 	"""
 	wanted = e.tag
 	earlier = sum(1 for sib in e.itersiblings(preceding=True) if sib.tag == wanted)
 	later = sum(1 for sib in e.itersiblings() if sib.tag == wanted)
 	if earlier + later:
 		return earlier + 1
 	return None

 def is_valid_class(c, siblings):
 	"""Tell whether class name ``c`` may be used to identify an element.

 	``c`` is one class token of the element; ``siblings`` is an iterable
 	of class-name collections, one per competing sibling.

 	Returns False when ``c`` is empty or whitespace-only (such tokens
 	appear when a class attribute is split on single spaces and would
 	yield a predicate that matches nothing useful), when it contains a
 	digit, or when any sibling collection contains it. Otherwise True.
 	"""
 	c = c.strip()
 	if not c:
 		return False
 	if re.search(r'[0-9]', c):
 		return False
 	return not any(c in sibling for sibling in siblings)

 def get_one_path(e):
 	"""Return the XPath step (tag plus optional predicate) for element ``e``.

 	Order of preference:
 	1. ``tag[@id='...']`` when the id contains no digit.
 	2. ``tag[contains(concat(' ', normalize-space(@class), ' '), ' c ')]``
 	   for the first class ``c`` accepted by ``is_valid_class`` against the
 	   classes of same-tag siblings.
 	3. ``tag[n]`` using ``get_index``, or the bare tag when ``get_index``
 	   returns nothing.

 	Values containing a single quote are skipped for the id and class
 	forms because they cannot be embedded in the single-quoted literal.
 	"""
 	tag = e.tag
 	attrs = e.attrib
 	if 'id' in attrs:
 		val = attrs['id']
 		if not re.search(r'[0-9]', val) and "'" not in val:
 			return tag + "[@id='%s']" % (val)
 	if 'class' in attrs:
 		# Use a class only when no sibling with the same tag carries it.
 		neighbours = list(e.itersiblings(preceding=True)) + list(e.itersiblings())
 		# split() with no argument handles tabs, newlines and repeated
 		# spaces and never yields empty tokens.
 		siblings = [
 			sib.attrib['class'].split()
 			for sib in neighbours
 			if sib.tag == tag and 'class' in sib.attrib
 		]
 		for c in attrs['class'].split():
 			if "'" not in c and is_valid_class(c, siblings):
 				return tag + "[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (c)
 	# The positional index is only needed when id and class were unusable.
 	index = get_index(e)
 	return tag + ("[%s]" % (index) if index else "")

 def get_xpath(e):
 	"""Build an absolute XPath for ``e`` by joining the steps of ``e`` and
 	each of its ancestors, outermost first, each prefixed with ``/``.
 	"""
 	built = "/%s%s" % (get_one_path(e), '')
 	ancestor = e.getparent()
 	# getparent() returning None means the root tag has been passed.
 	while ancestor is not None:
 		built = "/%s%s" % (get_one_path(ancestor), built)
 		ancestor = ancestor.getparent()
 	return built

 url = 'https://rent.tokyu-housing-lease.co.jp/rent/8016671/6337'
 def main():
 	"""Fetch ``url``, select one element with a reference XPath and print
 	whether ``get_xpath`` regenerates exactly that XPath.

 	Raises IndexError when the reference XPath matches nothing in the
 	fetched page.
 	"""
 	# Only reading the body needs the connection; close it before parsing.
 	with request.urlopen(url) as f:
 		data = f.read().decode('utf-8')
 	tree = etree.HTML(data)
 	given_path = "/html/body[@id='diamondtail']/div[@id='wrap']/div[@id='contents_wrap']/div[@id='contents']/div[@id='contents_inner']/div[@id='article']/div[@id='item_detail']/div/table[contains(concat(' ', normalize-space(@class), ' '), ' item_table ')]/tr[5]/th[1]"
 	matches = tree.xpath(given_path)
 	if not matches:
 		# Same exception type as the bare [0] lookup, but with context.
 		raise IndexError("no element matches the reference XPath at %s" % (url))
 	print(get_xpath(matches[0]) == given_path)

 # Run the check only when this file is executed as a script, not on import.
 if __name__ == '__main__':
 	main()
	from urllib import request
	from lxml import etree
	import re

	def get_index(e):
		"""Return the 1-based position of ``e`` among siblings sharing its tag.

		``None`` is returned when neither a preceding nor a following
		sibling has the same tag, i.e. when no positional predicate is
		needed.
		"""
		tag = e.tag
		prev_list = [i for i in e.itersiblings(preceding=True) if i.tag == tag]
		next_list = [i for i in e.itersiblings() if i.tag == tag]
		if not prev_list and not next_list:
			return None
		return len(prev_list) + 1

	def is_valid_class(c, siblings):
		"""Tell whether class name ``c`` may be used to identify an element.

		``c`` is one class token of the element; ``siblings`` is an
		iterable of class-name collections, one per competing sibling.

		Returns False when ``c`` is empty or whitespace-only (such tokens
		appear when a class attribute is split on single spaces), when it
		contains a digit, or when any sibling collection contains it.
		Otherwise True.
		"""
		c = c.strip()
		if not c:
			return False
		if re.search(r'[0-9]', c):
			return False
		for sibling in siblings:
			if c in sibling:
				return False
		return True

	def get_one_path(e):
		"""Return the XPath step (tag plus optional predicate) for element ``e``.

		Order of preference:
		1. ``tag[@id='...']`` when the id contains no digit.
		2. ``tag[contains(concat(' ', normalize-space(@class), ' '), ' c ')]``
		   for the first class ``c`` accepted by ``is_valid_class`` against
		   the classes of same-tag siblings.
		3. ``tag[n]`` using ``get_index``, or the bare tag when
		   ``get_index`` returns nothing.

		Values containing a single quote are skipped for the id and class
		forms because they cannot be embedded in the single-quoted literal.
		"""
		tag = e.tag
		this_attrib = e.attrib
		if 'id' in this_attrib:
			val = this_attrib['id']
			if not re.search(r'[0-9]', val) and "'" not in val:
				return tag + "[@id='%s']" % (val)
		if 'class' in this_attrib:
			# Use a class only when no sibling with the same tag carries it.
			prev_list = [i for i in e.itersiblings(preceding=True) if i.tag == tag and 'class' in i.attrib]
			next_list = [i for i in e.itersiblings() if i.tag == tag and 'class' in i.attrib]
			# split() with no argument handles tabs, newlines and repeated
			# spaces and never yields empty tokens.
			siblings = [sib.attrib['class'].split() for sib in prev_list + next_list]
			for c in this_attrib['class'].split():
				if "'" not in c and is_valid_class(c, siblings):
					return tag + "[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (c)
		# The positional index is only needed when id and class were unusable.
		index = get_index(e)
		return tag + ("[%s]" % (index) if index else "")

	def get_xpath(e):
		"""Build an absolute XPath for ``e`` by joining the steps of ``e``
		and each of its ancestors, outermost first, each prefixed with ``/``.
		"""
		my_xpath = ''
		while True:
			path = get_one_path(e)
			my_xpath = "/%s%s" % (path, my_xpath)
			e = e.getparent()
			# No parent left: the root tag has been reached.
			if e is None:
				return my_xpath

	url = 'https://rent.tokyu-housing-lease.co.jp/rent/8016671/6337'
	def main():
		"""Fetch ``url``, select one element with a reference XPath and
		print whether ``get_xpath`` regenerates exactly that XPath.

		Raises IndexError when the reference XPath matches nothing in the
		fetched page.
		"""
		# Only reading the body needs the connection; close it before parsing.
		with request.urlopen(url) as f:
			data = f.read().decode('utf-8')
		tree = etree.HTML(data)
		given_path = "/html/body[@id='diamondtail']/div[@id='wrap']/div[@id='contents_wrap']/div[@id='contents']/div[@id='contents_inner']/div[@id='article']/div[@id='item_detail']/div/table[contains(concat(' ', normalize-space(@class), ' '), ' item_table ')]/tr[5]/th[1]"
		matches = tree.xpath(given_path)
		if not matches:
			# Same exception type as the bare [0] lookup, but with context.
			raise IndexError("no element matches the reference XPath at %s" % (url))
		print(get_xpath(matches[0]) == given_path)

	# Run the check only when this file is executed as a script, not on import.
	if __name__ == '__main__':
		main()