Skip to content

Instantly share code, notes, and snippets.

@ismailmazumder
Created December 25, 2024 20:14
Show Gist options
  • Save ismailmazumder/adff07353eda3c0025d14999fba8cf25 to your computer and use it in GitHub Desktop.
Save ismailmazumder/adff07353eda3c0025d14999fba8cf25 to your computer and use it in GitHub Desktop.
Web scraping Bangla articles from Prothom Alo
import csv
import json
import os
import re
import threading

import bs4
import requests
# Start each run with a fresh output.csv that holds only the column header;
# the file is opened in 'w' mode, so any previous contents are discarded.
header_row = ["instruction", "input", "story"]
with open('output.csv', mode='w', encoding='utf-8', newline='') as csv_file:
    csv.writer(csv_file).writerow(header_row)
# Collection slugs grouped by the host that serves them. The order of hosts
# and of slugs within a host is the order in which the URLs appear in `links`.
_COLLECTION_SLUGS_BY_HOST = (
    ("www.prothomalo.com", (
        "latest",
        "politics",
        "bangladesh",
        "bangladesh/crime",
        "world",
        "business",
        "opinion",
        "sports",
        "chakri",
        "entertainment",
        "lifestyle",
    )),
    ("nagorik.prothomalo.com", (
        "nagorik-sangbad-more",
        "durporobash-life-style",
        "ayojon-nagorik-sangbad",
        "prosperity-nagorik-sangbad",
        "reader-nagorik-sangbad",
        "arts-nagorik-sangbad",
        "travel-nagorik-sangbad",
    )),
    ("www.bigganchinta.com", (
        "bigganchinta-latest",
    )),
    ("www.bondhushava.com", (
        "writings-bondhushava",
        "activities-bondhushava",
    )),
    ("www.kishoralo.com", (
        "kia-latest",
        "kobita-kishoralo",
        "interview-kishoralo",
        "sports-kishoralo",
        "feature-kishoralo",
        "entertainment-kishoralo",
        "lifestyle-kishoralo",
        "letter-kishoralo",
        "other-kishoralo",
    )),
    ("www.haal.fashion", (
        "fashion-haalfashion",
    )),
)

# Every collection API endpoint to scrape, one URL per (host, slug) pair.
links = [
    f"https://{host}/api/v1/collections/{slug}"
    for host, slugs in _COLLECTION_SLUGS_BY_HOST
    for slug in slugs
]
# main() is run from several worker threads at once and they all append to the
# same CSV file; holding this lock around each append keeps rows from
# interleaving with one another.
_CSV_WRITE_LOCK = threading.Lock()

# Seconds to wait on a single HTTP request before abandoning it, so one
# unresponsive server cannot stall a worker forever.
_REQUEST_TIMEOUT = 30

# Matches opening and closing paragraph tags (<p>, <p ...>, </p>) only.
_PARAGRAPH_TAG = re.compile(r"</?p\b[^>]*>", re.IGNORECASE)
_WHITESPACE_RUN = re.compile(r"\s+")


def _story_texts(page_html):
    """Yield the cleaned story text of each ``static-page`` script in a page.

    Each matching ``<script id="static-page">`` is parsed as JSON and the
    ``text`` of the first story element of every card under
    ``qt.data.story.cards`` is concatenated. Missing keys raise
    ``KeyError``/``IndexError`` so the caller can skip the article.
    """
    soup = bs4.BeautifulSoup(page_html, 'html.parser')
    for script in soup.find_all('script', {'id': 'static-page'}):
        page_data = json.loads(script.string.strip())
        cards = page_data['qt']['data']['story']['cards']
        raw = " ".join(str(card['story-elements'][0]['text']) for card in cards)
        # Tags are replaced by a space (not removed outright) so that the last
        # word of one paragraph does not fuse with the first word of the next.
        yield _WHITESPACE_RUN.sub(" ", _PARAGRAPH_TAG.sub(" ", raw)).strip()


def _append_row(row):
    """Append one row to output.csv, serialized across worker threads."""
    with _CSV_WRITE_LOCK:
        # Same encoding/newline settings as the header write, so non-ASCII
        # text is stored as UTF-8 and the csv module controls line endings.
        with open('output.csv', 'a', encoding='utf-8', newline='') as csv_file:
            csv.writer(csv_file).writerow(row)


def main(link, max_pages=100, page_size=40):
    """Scrape every article of one collection endpoint into output.csv.

    The collection API at ``link`` is paged with ``offset``/``limit`` query
    parameters. For every listed item the article page is downloaded, its
    story text is extracted, and one ``[instruction, headline, story]`` row
    is appended to output.csv.

    Args:
        link: URL of the collection API endpoint.
        max_pages: Upper bound on the number of collection pages requested.
        page_size: Number of items requested per page; also the amount the
            offset advances by, so consecutive pages do not overlap.

    Returns:
        None. Progress and per-item errors are printed.
    """
    instruction = "গল্প লিখো আমার বলার বিষয় টি নিয়ে "
    # Kept separate from the article URLs so that later pages are still
    # requested from the collection endpoint.
    collection_url = str(link)
    saved = 0
    offset = 0
    for _ in range(max_pages):
        params = {'offset': f'{offset}', 'limit': f'{page_size}'}
        offset += page_size
        try:
            response = requests.get(
                collection_url, params=params, timeout=_REQUEST_TIMEOUT
            )
            payload = response.json()
        except (requests.RequestException, ValueError) as e:
            # Best effort: report the failed page and move on to the next one
            # instead of reprocessing stale data from the previous page.
            print(e, "json")
            continue

        items = payload.get('items') if isinstance(payload, dict) else None
        if not items:
            # An empty (or missing) item list means there is nothing further
            # to page through.
            break

        for item in items:
            # Deliberately broad: one malformed or unreachable article must
            # not stop the rest of the collection from being scraped.
            try:
                story_url = item['story']['url']
                headline = str(item['item']['headline'][0])
                page = requests.get(story_url, timeout=_REQUEST_TIMEOUT)
                for story in _story_texts(page.text):
                    _append_row([instruction, headline, story])
                    saved += 1
            except Exception as e:
                print(e)
        print(params, saved)
from concurrent.futures import ThreadPoolExecutor, as_completed

# Scrape all collections concurrently, one worker thread per collection URL.
with ThreadPoolExecutor(max_workers=max(1, len(links))) as executor:
    pending = {executor.submit(main, url): url for url in links}
    # Inspect every future: an exception raised inside a worker is otherwise
    # stored on the future and never reported.
    for future in as_completed(pending):
        error = future.exception()
        if error is not None:
            print(pending[future], error)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment