Created
July 26, 2016 10:38
-
-
Save pontikos/1635ddc88cb26c132b0b42592376e377 to your computer and use it in GitHub Desktop.
GeneCards Python scraper that uses Selenium and PhantomJS to circumvent Incapsula bot protection.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function

import re
import sys
from random import randint
from time import sleep

from selenium import webdriver

# GeneCards gene page; %s is filled with a gene name read from stdin.
GENECARDS_URL = 'http://www.genecards.org/cgi-bin/carddisp.pl?gene=%s'

# Captures the Ensembl gene ID from a `gene=ENSG..."` fragment in the page
# source. Compiled once instead of once per input line.
ENSEMBL_ID_RE = re.compile(r'gene=(ENSG.*?)"')

# Upper bound on page loads per gene, so that a gene whose page never yields
# an Ensembl ID cannot stall the whole run.
MAX_ATTEMPTS = 10


def fetch_ensembl_id(driver, gene, max_attempts=MAX_ATTEMPTS):
    """Return the Ensembl gene ID shown on the GeneCards page for *gene*.

    The page is reloaded until the ID pattern is found in the page source
    or *max_attempts* loads have been made. A random 1-10 second pause
    follows every load.

    Args:
        driver: Selenium WebDriver used to load the page.
        gene: Gene name to substitute into the GeneCards URL.
        max_attempts: Maximum number of page loads before giving up.

    Returns:
        The matched ID string (starting with ``ENSG``), or ``None`` when no
        load produced a match.
    """
    for _ in range(max_attempts):
        driver.get(GENECARDS_URL % gene)
        # Return value is unused; kept from the original script.
        # NOTE(review): presumably part of getting past Incapsula - confirm.
        driver.get_cookies()
        found = ENSEMBL_ID_RE.search(driver.page_source)
        # Pause after every request, successful or not.
        sleep(randint(1, 10))
        if found:
            return found.group(1)
    return None


def main():
    """Read gene names from stdin and print ``<gene> <Ensembl ID>`` lines.

    Blank input lines are skipped. Genes for which no ID could be found are
    reported on stderr and do not appear on stdout.
    """
    # NOTE(review): PhantomJS support was dropped in Selenium 4 - confirm the
    # installed selenium version still provides webdriver.PhantomJS.
    dr = webdriver.PhantomJS()
    try:
        for line in sys.stdin:
            gene = line.strip()
            if not gene:
                continue
            ensembl_id = fetch_ensembl_id(dr, gene)
            if ensembl_id is None:
                print('%s: no Ensembl gene ID found after %d attempts'
                      % (gene, MAX_ATTEMPTS), file=sys.stderr)
                continue
            print(gene, ensembl_id)
    finally:
        # Always shut the browser process down, even on error or Ctrl-C.
        dr.quit()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
python genecards.py < genes.txt
where genes.txt: