Last active
February 21, 2017 05:58
-
-
Save kenzotakahashi/390552e85c05af4c39112dcb116fa969 to your computer and use it in GitHub Desktop.
HTML elementのXpathを取得するスクリプト
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from urllib import request

from lxml import etree
def get_index(e):
    """Return the 1-based position of ``e`` among siblings sharing its tag.

    Args:
        e: an element exposing ``tag`` and ``itersiblings(preceding=...)``
            (the script passes lxml elements).

    Returns:
        ``None`` when no sibling has the same tag as ``e`` (so no positional
        predicate is needed); otherwise the number of preceding same-tag
        siblings plus one.
    """
    tag = e.tag
    # Count same-tag siblings that come before this element.
    preceding = sum(
        1 for sib in e.itersiblings(preceding=True) if sib.tag == tag
    )
    # Only look ahead when nothing precedes: one following same-tag sibling
    # is enough to require an index.
    if preceding == 0 and not any(sib.tag == tag for sib in e.itersiblings()):
        return None
    return preceding + 1
def is_valid_class(c, siblings):
    """Decide whether class name ``c`` may be used as an XPath predicate.

    Args:
        c: a single class token taken from the element's ``class`` attribute.
        siblings: an iterable of class-token collections, one per sibling
            that must not share the class.

    Returns:
        ``False`` if ``c`` is empty after stripping whitespace, contains a
        digit, or occurs in any entry of ``siblings``; ``True`` otherwise.
    """
    c = c.strip()
    # An empty token (e.g. produced by consecutive spaces in the attribute)
    # would yield a meaningless predicate, so it is never valid.
    if not c:
        return False
    # Class names containing digits are rejected.
    if re.search(r'[0-9]', c):
        return False
    return not any(c in sibling for sibling in siblings)
def get_one_path(e):
    """Return a single XPath step (tag plus optional predicate) for ``e``.

    The predicate is chosen in this order:

    1. ``[@id='...']`` when the element has an ``id`` without digits.
    2. A class predicate when one of the element's class tokens passes
       ``is_valid_class`` against the classes of its same-tag siblings.
    3. A positional ``[n]`` from ``get_index``, or no predicate at all when
       ``get_index`` returns ``None``.

    Args:
        e: an element exposing ``tag``, ``attrib`` and ``itersiblings``
            (the script passes lxml elements).

    Returns:
        The step as a string, without a leading ``/``.
    """
    tag = e.tag
    this_attrib = e.attrib

    if 'id' in this_attrib:
        val = this_attrib['id']
        # ids containing a digit are skipped; fall through to class/index.
        if not re.search(r'[0-9]', val):
            return tag + "[@id='%s']" % (val)

    if 'class' in this_attrib:
        # Use a class only if no sibling with the same tag has that class.
        prev_list = [
            sib for sib in e.itersiblings(preceding=True)
            if sib.tag == tag and 'class' in sib.attrib
        ]
        next_list = [
            sib for sib in e.itersiblings()
            if sib.tag == tag and 'class' in sib.attrib
        ]
        # split() (not split(' ')) so repeated spaces, tabs and newlines do
        # not produce empty or whitespace-carrying tokens; this matches the
        # normalize-space() used in the generated predicate.
        siblings = [sib.attrib['class'].split() for sib in prev_list + next_list]
        for c in this_attrib['class'].split():
            if is_valid_class(c, siblings):
                return tag + "[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (c)

    # Fall back to a positional predicate (omitted for a unique tag).
    index = get_index(e)
    index = "[%s]" % (index) if index else ""
    return tag + index
def get_xpath(e):
    """Build an absolute XPath for ``e`` by walking up to the root.

    Each ancestor (and ``e`` itself) contributes one step produced by
    ``get_one_path``; the steps are joined with ``/`` from the root down.

    Args:
        e: an element exposing ``getparent()`` (the script passes lxml
            elements).

    Returns:
        The XPath string, starting with ``/``.
    """
    steps = []
    node = e
    while True:
        steps.append(get_one_path(node))
        node = node.getparent()
        # Reached the root tag: it has no parent.
        if node is None:
            break
    # Steps were collected leaf-first; emit them root-first.
    return "".join("/%s" % step for step in reversed(steps))
# Page fetched by main() for the round-trip self-check.
url = 'https://rent.tokyu-housing-lease.co.jp/rent/8016671/6337'
def main():
    """Fetch the sample page and check that get_xpath round-trips an XPath.

    Downloads ``url``, decodes it as UTF-8, parses it with ``etree.HTML``,
    selects one element with a hand-written XPath, regenerates the XPath
    with ``get_xpath`` and prints whether the two strings are equal.

    Raises:
        IndexError: if the hand-written XPath matches no element.
    """
    with request.urlopen(url) as f:
        data = f.read().decode('utf-8')
    tree = etree.HTML(data)
    given_path = (
        "/html/body[@id='diamondtail']/div[@id='wrap']"
        "/div[@id='contents_wrap']/div[@id='contents']"
        "/div[@id='contents_inner']/div[@id='article']"
        "/div[@id='item_detail']/div"
        "/table[contains(concat(' ', normalize-space(@class), ' '), ' item_table ')]"
        "/tr[5]/th[1]"
    )
    matches = tree.xpath(given_path)
    if not matches:
        # Same exception type a bare [0] would raise, but with a message
        # that says what went wrong.
        raise IndexError(
            "no element matches the reference XPath: %s" % given_path
        )
    print(get_xpath(matches[0]) == given_path)
# Run the self-check only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment