Skip to content

Instantly share code, notes, and snippets.

@reservoirinvest
Last active November 27, 2017 12:55
Show Gist options
  • Save reservoirinvest/8cdcc05b32590edcc2f54cea4cd5b8b4 to your computer and use it in GitHub Desktop.
Save reservoirinvest/8cdcc05b32590edcc2f54cea4cd5b8b4 to your computer and use it in GitHub Desktop.
Scrape from URLs # pandas
########################################################
#### Scrape a table with a known index into pandas
########################################################
import pandas as pd

## Scrape a single table from a URL, selecting it by its position on the page
symlotmarginurl = "https://www.5paisa.com/5pit/spma.asp"

# read_html returns a list with one DataFrame per <table> it parsed;
# position 1 is the second table on the page.
_page_tables = pd.read_html(symlotmarginurl)
symlotmargin = _page_tables[1]
########################################################
#### Scrape a table with an id into pandas
########################################################
import io

import requests

scrip = 'PFC'
exp_date = '28DEC2017'
url = (
    "https://www.nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?&instrument=OPTSTK&symbol="
    + scrip + "&date=" + exp_date
)

# Options table
# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops us from trying to parse an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

# Wrap the raw bytes in a file-like object: passing literal HTML directly to
# read_html is deprecated in recent pandas releases.
# - attrs selects only the <table id="octable"> element
# - header=1 uses the second row of the table as the column header
# - [-1] takes the last matching table, iloc[:-1] drops its final row
#   (presumably a totals/footer row -- confirm against the page)
# - drop(columns=...) replaces the positional `axis` argument (`drop(..., 1)`),
#   which pandas 2.0 no longer accepts.
nsetable = (
    pd.read_html(io.BytesIO(html), attrs={'id': 'octable'}, header=1)[-1]
    .iloc[:-1]
    .drop(columns=['Chart', 'Chart.1'])
)
########################################################
#### Scrape a value from JSON generated URL
########################################################
import json

import requests
from bs4 import BeautifulSoup

url = "https://www.nseindia.com/live_market/dynaContent/live_watch/get_quote/GetQuoteFO.jsp?underlying=PFC&instrument=FUTSTK"

# Bound the request time and fail early on an HTTP error instead of parsing
# an error page.
quote_response = requests.get(url, timeout=30)
quote_response.raise_for_status()
html = quote_response.text

# The JSON payload is read from the text of the element with id "responseDiv".
soup = BeautifulSoup(html, 'html.parser')
response_div = soup.find(id="responseDiv")
if response_div is None:
    # find() returns None when nothing matches; report that explicitly rather
    # than failing with an AttributeError on `.text`.
    raise ValueError("no element with id 'responseDiv' found at " + url)

data = response_div.text.strip()
d1 = json.loads(data)

# First entry of the 'data' list, field 'annualisedVolatility', as a float.
d2 = float(d1['data'][0]['annualisedVolatility'])
# Bare expression: shows the value in an interactive session / notebook cell.
d2
#########################################################
#### Scrape dividend from Google Finance page
#########################################################
# NOTE: this import rebinds the name `html`, which the earlier sections of
# this script use for downloaded page content.
from lxml import etree, html
import requests

url = "https://finance.google.com/finance?q=NSE:PFC"

# Bound the request time and fail early on an HTTP error instead of parsing
# an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
root = html.fromstring(page.content)

# Take the text of the third <table> in the document, then the third
# newline-separated line of it, and keep what precedes the first '/'.
# NOTE(review): this relies entirely on the page layout (table order and line
# position); presumably the line reads "<dividend>/<yield>" -- confirm.
third_table_text = root.findall('.//table')[2].text_content().strip()
dividend_line = third_table_text.split("\n")[2]
dividend = float(dividend_line.split('/')[0])
# Bare expression: shows the value in an interactive session / notebook cell.
dividend
########################################################
#### Scrape a value from an element tree
########################################################
import requests
import lxml.html

# Bound the request time and fail early on an HTTP error instead of parsing
# an error page.
response = requests.get('http://www.un.org/en/sc/documents/resolutions/2016.shtml', timeout=30)
response.raise_for_status()

# lxml.html has no `HTML` function; `fromstring` is the parser entry point
# that turns an HTML string into an element tree.
tree = lxml.html.fromstring(response.text)

# Two ways of selecting the <title> element; the second assignment overwrites
# the first and is kept to show the CSS-selector form.
title_elem = tree.xpath('//title')[0]
# cssselect() needs the optional `cssselect` package to be installed.
title_elem = tree.cssselect('title')[0] # equivalent to previous XPath

print("title tag:", title_elem.tag)
print("title text:", title_elem.text_content())
print("title html:", lxml.html.tostring(title_elem))
print("title's parent's tag:", title_elem.getparent().tag)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment