Skip to content

Instantly share code, notes, and snippets.

@redutan
Last active September 21, 2017 07:54
Show Gist options
  • Save redutan/0cbb49409c95bdb99347ec6e385971ec to your computer and use it in GitHub Desktop.
Save redutan/0cbb49409c95bdb99347ec6e385971ec to your computer and use it in GitHub Desktop.
top_instagram_hashtags_crawler
requests==2.18.4
BeautifulSoup4==4.6.0
lxml==3.8.0
import requests
from bs4 import BeautifulSoup
import re
def crawl(max_pages, timeout=10):
    """Print Korean Instagram hashtags scraped from top-hashtags.com.

    Requests the listing pages
    ``https://top-hashtags.com/instagram/<offset>/`` where ``<offset>`` is
    ``page * 100 + 1`` for each ``page`` in ``range(max_pages)``. Every
    hashtag that begins with a Hangul syllable and does not contain the
    substring '그램' is printed on its own line.

    Args:
        max_pages: Number of listing pages to request (an integer). Zero or
            a negative value requests nothing.
        timeout: Seconds to wait for each HTTP response; passed straight to
            ``requests.get`` so a stalled connection cannot hang the crawl.

    Returns:
        None. Matching hashtags are written to standard output.
    """
    # Compiled once for the whole crawl. ``match`` anchors at the start of
    # the string, so only tags that *begin* with Hangul pass the filter.
    hangul_prefix = re.compile('[가-힣]+')

    for page in range(max_pages):
        url = 'https://top-hashtags.com/instagram/' + str(page * 100 + 1) + '/'
        response = requests.get(url, timeout=timeout)
        if not response.ok:
            # Error responses are skipped instead of being parsed as listings.
            continue

        soup = BeautifulSoup(response.text, 'lxml')
        for tag_link in soup.select('div.tht-tag > a'):
            text = tag_link.string
            if not text:
                # ``.string`` is None when the anchor does not wrap exactly
                # one text node; slicing it would raise TypeError.
                continue

            # Drop the first character (presumably the leading '#').
            tag_string = text[1:]
            if not hangul_prefix.match(tag_string):
                continue
            if '그램' in tag_string:
                continue
            print(tag_string)
# Script entry: crawl listing pages 0-999 and print the matching hashtags.
crawl(1000)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment