Created
July 27, 2019 11:08
-
-
Save Sergeydigl3/cc5e890b38486d0d00f7f4a65230f93a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#@title Download smth
# Colab cell: walks the paginated post listing of the site, opens every post
# it links to, and saves the full-size version of each image found in the
# post's gallery into <main_path>/<main_name>/<post slug>/.
import requests
from bs4 import BeautifulSoup
from pathlib import Path
from os.path import join
from os import chdir, mkdir
import PIL.Image as Image
from itertools import groupby

main_path = '/content/'
main_name = 'czech'
pages = 37
# Seconds to wait for any single HTTP request before giving up on it, so one
# stalled connection cannot hang the whole run.
request_timeout = 30
# Suffix the gallery puts on thumbnail file names; the same name without it
# is requested as the full-size image.
thumb_suffix = '-300x400.jpg'

chdir(main_path)
# exist_ok avoids the check-then-create race of a separate exists() test.
Path(main_name).mkdir(exist_ok=True)
chdir(main_name)

#@markdown Start from page
page=1 #@param {type: "number"}


def _fetch(url):
    """Return the body of *url* as bytes, or None if the request fails.

    Connection errors, timeouts and non-2xx responses are reported on stdout
    and turned into None, so an error page is never saved as if it were data.
    """
    try:
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        print('Skipping {}: {}'.format(url, err))
        return None
    return response.content


def _person_links(html):
    """Return the unique post URLs on a listing page, in page order."""
    container = BeautifulSoup(html, 'lxml').find('div', {'class': 'posts'})
    if container is None:
        return []
    links = []
    for post in container.find_all('div', {'class': 'post'}):
        anchor = post.find('a')
        if anchor is not None and anchor.get('href'):
            links.append(anchor['href'])
    # dict.fromkeys drops every repeat while keeping first-seen order;
    # groupby on unsorted input would only collapse adjacent repeats.
    return list(dict.fromkeys(links))


def _full_size(src):
    """Map a thumbnail URL to a ``(full-size URL, file name)`` pair.

    When the file name does not carry the thumbnail suffix it is returned
    unchanged instead of getting a second ``.jpg`` appended.
    """
    base, sep, name = src.rpartition('/')
    if thumb_suffix in name:
        name = name.split(thumb_suffix)[0] + '.jpg'
    return base + sep + name, name


def _download_person(person_page):
    """Save every gallery image of one post into a directory named after it."""
    # Last path segment of the URL, with or without a trailing slash.
    person_dir = person_page.rstrip('/').rsplit('/', 1)[-1]
    print(person_dir)
    if person_dir in ('', '.', '..'):
        return
    # Files are addressed through this path rather than by chdir-ing in and
    # out, so a failure part-way through cannot leave the working directory
    # pointing at the wrong place for the next post.
    target = Path(person_dir)
    target.mkdir(exist_ok=True)
    html = _fetch(person_page)
    if html is None:
        return
    gallery = BeautifulSoup(html, 'lxml').find('div', {'id': 'gallery-2'})
    if gallery is None:
        print('No gallery found on {}'.format(person_page))
        return
    for image in gallery.find_all('img'):
        src = image.get('src')
        if not src:
            continue
        url, name = _full_size(src)
        if name in ('', '.', '..'):
            continue
        # One request per image; the body is written only on success.
        data = _fetch(url)
        if data is None:
            continue
        (target / name).write_bytes(data)
    # TODO: labelling of the downloaded images should happen here.


while page <= pages:
    listing = _fetch('https://czechcasting.adultsites.co/page/{}/'.format(page))
    persons = _person_links(listing) if listing is not None else []
    print(persons)
    for person_page in persons:
        _download_person(person_page)
    print('\n\nCheckpoint: {} page\n\n'.format(page))
    page=page+1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment