Last active
August 31, 2021 00:13
-
-
Save kaustumbh7/3b1ae17c192e0853b0facd78f2407462 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape the paragraphs of a BBC Sport article, print them, and save them
# to 'scraped_text.txt'.

# import libraries
from urllib.request import urlopen

from bs4 import BeautifulSoup

# specify the url
url = "https://www.bbc.com/sport/football/46897172"

# Exact class attributes of the <div> that wraps the article text.  The
# first is the value this script was written against; the second is a later
# variant reported for the same page.  They are tried in order, so the
# original match still wins whenever it is present.
story_classes = (
    "story-body sp-story-body gel-body-copy",
    "qa-story-body story-body gel-pica gel-10/12@m gel-7/8@l gs-u-ml0@l gs-u-pb++",
)

# Connect to the website and read the html.  urllib's URLError/HTTPError
# are OSError subclasses and a malformed URL raises ValueError; on any of
# these there is nothing to parse, so stop instead of continuing with an
# undefined response object.
try:
    with urlopen(url) as page:  # the context manager closes the connection
        html = page.read()
except (OSError, ValueError) as err:
    raise SystemExit(f"Error opening the URL: {err}")

# parse the html using beautiful soup and store in variable `soup`
soup = BeautifulSoup(html, 'html.parser')

# Take out the <div> holding the article body: first known class that matches
content = None
for story_class in story_classes:
    content = soup.find('div', {"class": story_class})
    if content is not None:
        break

# soup.find returns None when nothing matches; fail with a clear message
# rather than an AttributeError on the next line.
if content is None:
    raise SystemExit("Could not find the article body <div> in the page")

# Join the text of every <p>; each paragraph keeps the single leading space
# the original output format used.
article = ''.join(' ' + paragraph.text for paragraph in content.find_all('p'))
print(article)

# Saving the scraped text (explicit UTF-8 so non-ASCII characters in the
# article do not depend on the platform's default encoding)
with open('scraped_text.txt', 'w', encoding='utf-8') as file:
    file.write(article)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Update line 18
content = soup.find('div', {"class": "qa-story-body story-body gel-pica gel-10/12@m gel-7/8@l gs-u-ml0@l gs-u-pb++"})