Get Nasdaq-100 stocks trade data for the trailing 2 years from polygon.io
from bs4 import BeautifulSoup
from datetime import timedelta
import pandas as pd
import pandas_market_calendars as mcal
import requests
import time
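# The imports above correspond to these PyPI packages (install sketch,
# assuming the commonly published package names):
# pip install beautifulsoup4 lxml pandas pandas-market-calendars requests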
# ----------------------------------------------
# Scrape NASDAQ 100 stock ticker names from wiki
# ----------------------------------------------
def get_nasdaq_100_tickers():
    # Create a soup object from the target HTML page
    url = 'https://en.wikipedia.org/wiki/Nasdaq-100'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    # Find the table of rows containing the NASDAQ 100 constituents
    table = soup.find('table', {'id': 'constituents'})
    rows = table.tbody.find_all('tr')
    # Create a list to store tickers
    tickers = []
    for row in rows[1:]:  # Skip the header row
        # Access the ticker from the row data (2nd column)
        tds = row.find_all('td')[1]
        # Append the ticker to the list
        tickers.append(tds.text.strip())
    # Remove the class-A ticker for Alphabet as class-C (GOOG) already exists
    tickers.remove('GOOGL')
    return tickers
# Get NASDAQ 100 tickers
tickers = get_nasdaq_100_tickers()
# Print the number of tickers collected
print(f'Tickers Found: {len(tickers)}')  ## Tickers Found: 100
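# The scrape leans on two page-layout details that can drift over time: the
# table id 'constituents' and the ticker living in the 2nd column. A quick
# spot-check of the parsed output (values shown are illustrative, not actual):
# print(tickers[:3])  # e.g. ['ADBE', 'AMD', 'ABNB']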
# -------------------------------------------
# Get valid trading days for trailing 2 years
# -------------------------------------------
# Get the trading calendar for the New York Stock Exchange (NYSE)
nyse = mcal.get_calendar('NYSE')
# Set the latest end date in the US Eastern Time Zone (ET)
end_date = pd.Timestamp.now(tz='US/Eastern')
# Set the start date 2 years behind the end date in the same timezone
start_date = (end_date - pd.DateOffset(years=2)).tz_convert('US/Eastern')
# Get valid trading days for the last 2 years, but don't specify a timezone here
trading_days = nyse.valid_days(start_date=start_date.tz_localize(None), end_date=end_date.tz_localize(None))
# Then convert the timezone-aware result to 'US/Eastern'
trading_days = trading_days.tz_convert('US/Eastern')
# Print the number of trading days in the trailing 2 years
print(f'No. of trading days in trailing 2 years: {len(trading_days)}')
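# Sizing note for the request windows used below: polygon.io minute bars can
# include pre-/post-market sessions (roughly 16 hours, i.e. up to about
# 16 * 60 = 960 bars a day), so a 52-day window tops out near
# 52 * 960 = 49,920 bars, just under the endpoint's 50,000-result cap.
# The 960-bars/day figure is an assumption about extended-hours coverage,
# not a documented constant.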
# -------------------------------
# Define a function to fetch data
# -------------------------------
# Function to fetch minute-aggregate data for a ticker within a given date range
def fetch_data(ticker, start_date, end_date):
    url = f"https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/minute/{start_date}/{end_date}"
    params = {"sort": "asc", "limit": 50000}
    # Authenticate with your own polygon.io API key
    headers = {"Authorization": "Bearer gwetlM945TUXEqGt_peMizmmLyg3U6fS"}
    response = requests.get(url, params=params, headers=headers)
    # Error responses (e.g. rate limits) carry no 'results' key, so fall back
    # to an empty list instead of raising a KeyError
    data = response.json().get('results', [])
    df = pd.DataFrame(data)
    return df
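# A quick way to sanity-check the response shape before the full run; the
# ticker and dates below are hypothetical placeholders. Per polygon.io's
# aggregate bar schema, columns include o/h/l/c (prices), v (volume),
# vw (volume-weighted price), n (trade count) and t (bar start, Unix ms).
# sample = fetch_data('AAPL', '2023-01-03', '2023-03-17')
# print(sample.head())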
# --------------------------------------------------
# Get data for each ticker and save it to a csv file
# --------------------------------------------------
# Initialize variables to track progress
start_time = time.time()
total_api_calls = 0
for t in range(100):
    # Select a ticker
    ticker = tickers[t]
    # Initialize an empty DataFrame for the ticker
    df_all = pd.DataFrame()
    # Loop through trading days in windows of 52 days because the aggregates endpoint returns at most 50,000 data points per API request
    for day in range(0, len(trading_days), 52):
        # Select the date range for a span of 52 days; both endpoints are
        # inclusive, so the window ends at day + 51 to avoid re-fetching the
        # first day of the next window
        start_date = trading_days[day].date().isoformat()
        end_date = trading_days[min(day + 51, len(trading_days) - 1)].date().isoformat()
        # Fetch data for the ticker in that date range
        df = fetch_data(ticker, start_date, end_date)
        # Increment total API calls by 1
        total_api_calls += 1
        # Append the data to the ticker's data frame
        df_all = pd.concat([df_all, df])
        # Sleep for 12 seconds to stay within API rate limits (5 requests per minute)
        time.sleep(12)
        # Print a progress update
        elapsed_time = time.time() - start_time
        print(f"\rProcessing ticker {t+1}/100: {ticker} || API Calls: {total_api_calls}/1000 || Elapsed Time: {str(timedelta(seconds=int(elapsed_time)))}", end="")
    # Write the ticker's data to CSV
    df_all.to_csv(f"{ticker}.csv", index=False)
# The //10 below assumes each fully fetched ticker took exactly 10 API requests (10 windows of 52 trading days)
print(f'Trailing 2 years trade data was successfully fetched for {total_api_calls//10} out of 100 stocks and total api calls made were: {total_api_calls}')
## OUTPUT: Processing ticker 100/100: ZS || API Calls: 960/1000 || Elapsed Time: 7:43:31
## Trailing 2 years trade data was successfully fetched for 96 out of 100 stocks and total api calls made were: 960
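# Reading a saved file back for analysis. This sketch assumes polygon.io's
# aggregate bar keys ('t' is the bar start time in Unix milliseconds; 'o',
# 'h', 'l', 'c', 'v' are the OHLCV fields) and a hypothetical AAPL.csv output:
# bars = pd.read_csv('AAPL.csv')
# bars['timestamp'] = pd.to_datetime(bars['t'], unit='ms', utc=True).dt.tz_convert('US/Eastern')
# print(bars[['timestamp', 'o', 'h', 'l', 'c', 'v']].head())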
# ----------------------------------------------------- E N D -----------------------------------------------------