Last active: August 30, 2019 14:03
noelfic scraper
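Two scripts. The first (below) reads the fic URLs pickled in donnees.txt, scrapes each fic's metadata, chapters, and comments from noelfic.fr, and inserts them into a local PostgreSQL database. The second crawls the site's date-ordered ranking pages to build that URL list.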
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import re
import datetime
import psycopg2
import pickle
import sys

try:
    conn = psycopg2.connect("dbname='scrapper' user='postgres' host='localhost' password='root'")
except psycopg2.OperationalError as e:
    raise ConnectionError("could not reach the local PostgreSQL server") from e
cur = conn.cursor()
# donnees.txt holds the list of fic URLs pickled by the collector script below
with open('donnees.txt', 'rb') as fichier:
    mon_depickler = pickle.Unpickler(fichier)
    lien = mon_depickler.load()

# lien = lien[0:2]  # uncomment to test on the first two fics only

for count, url in enumerate(lien):
    print(str(count + 1) + " / " + str(len(lien)))
    html = urlopen("https://www.noelfic.fr" + str(url)).read()
    soup = BeautifulSoup(html, "lxml")
    # ############# #
    # FIRST CHAPTER #
    # ############# #
    # Numeric id of the fic, taken from its URL ("/fic/<id>-<slug>")
    oldId = int(re.search('fic/(.+?)-', url).group(1))
    # Fic rating: one star image per point
    noteFinder = soup.find("div", attrs={"class": "right-align"})
    note = len(noteFinder.find_all("img"))
    # Chapter count, read out of an inline <script> variable
    chapFinder = soup.find_all("script")
    chapFinder = str(chapFinder[2].get_text())
    chapitre = int(re.search('var totalChapters = (.*);function handleChapterKey', chapFinder).group(1))
    infoFinder = soup.find("div", attrs={"class": "center-align"})
    h4 = infoFinder.find_all("h4")
    titre = h4[0].get_text()
    print(titre)  # fic title
    paragraphe = infoFinder.find("p").get_text()
    auteurListe = re.search('Par : (.*)Genre : ', paragraphe).group(1)  # fic authors
    genre = re.search('Genre : (.*)Statut : ', paragraphe).group(1)  # fic genre
    statut = re.search('Statut : (.*)', paragraphe).group(1)  # fic status
    AllInfo = [{"titre": titre, "auteurs": auteurListe, "genre": genre, "statut": statut,
                "chapitres": chapitre, "note": note, "oldId": oldId}]
    cur.executemany(
        """INSERT INTO fics
           (uuid, titre, auteurs, genre, statut, chapitres, note, oldId)
           VALUES
           (uuid_generate_v1mc(), %(titre)s, %(auteurs)s, %(genre)s, %(statut)s, %(chapitres)s, %(note)s, %(oldId)s)""",
        AllInfo)
    conn.commit()
    # Walk every chapter page of the fic
    allchapitres = []
    for i in range(1, chapitre + 1):
        nexturlFinder = url.split("/")
        nexturl = nexturlFinder[1] + "/" + nexturlFinder[2] + "/" + str(i)
        html = urlopen("https://www.noelfic.fr/" + nexturl).read()
        time.sleep(8)  # throttle requests between chapter pages
        soup = BeautifulSoup(html, "lxml")
        infoFinder = soup.find("div", attrs={"class": "center-align"})
        h4 = infoFinder.find_all("h4")
        chapName = h4[1].get_text()
        sys.stdout.write(chapName)  # chapter number
        sys.stdout.flush()
        # Publication date and time: "Publié le <date> à <heure> par <auteur>"
        datetFinder = soup.find("p", attrs={"class": "left-align"})
        date = re.search('Publié le (.*) à ', datetFinder.get_text()).group(1)
        heure = re.search(' à (.*) par ', datetFinder.get_text()).group(1)
        datetimechap = datetime.datetime.strptime(date + " " + heure, '%d/%m/%Y %H:%M:%S')
        currentauteur = datetFinder.find("b").get_text()  # chapter author
        # Chapter body: keep the inner HTML, minus the wrapper div and control characters
        contentFinder = soup.find("div", attrs={"class": "card grey lighten-2"})
        contentFinder = contentFinder.find("div", attrs={"class": "card-content"})
        content = (str(contentFinder)
                   .replace('<div class="card-content">', "")
                   .replace('</div>', "")
                   .replace("\n", "")
                   .replace("\r", "")
                   .replace("\t", ""))
        allchapitres.append({"oldId": oldId, "chapitreTitle": chapName, "datetime": datetimechap,
                             "cAuteur": currentauteur, "content": content})
        # ######## #
        # COMMENTS #
        # ######## #
        commentFinder = soup.find_all("li", attrs={"class": "collection-item avatar"})
        allComment = []
        for comment in commentFinder:
            pseudo = comment.find("a").get_text()
            commentget = comment.find_all("p")
            commentDate = datetime.datetime.strptime(commentget[0].get_text(), '%d/%m/%Y')
            commentContent = str(commentget[1]).replace("<p>", "").replace("</p>", "")
            allComment.append({"pseudo": pseudo, "datetime": commentDate, "comment": commentContent,
                               "oldId": oldId, "page": i})
        cur.executemany(
            """INSERT INTO commentaires
               (pseudo, dateecr, commentaire, idfic, page)
               VALUES
               (%(pseudo)s, %(datetime)s, %(comment)s, %(oldId)s, %(page)s)""",
            allComment)
        conn.commit()
    sys.stdout.write('\n import..\n')
    sys.stdout.flush()
    cur.executemany(
        """INSERT INTO chapitres
           (oldid, title, datetime, auteur, content)
           VALUES
           (%(oldId)s, %(chapitreTitle)s, %(datetime)s, %(cAuteur)s, %(content)s)""",
        allchapitres)
    conn.commit()
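The INSERT statements assume that the fics, chapitres, and commentaires tables, plus the uuid-ossp extension (which provides uuid_generate_v1mc()), already exist. A minimal one-off setup sketch: the table and column names come from the INSERTs above, while the column types are assumptions inferred from the values the scraper inserts.

# Hypothetical setup script; names are taken from the INSERTs above, types are assumptions.
import psycopg2

conn = psycopg2.connect("dbname='scrapper' user='postgres' host='localhost' password='root'")
cur = conn.cursor()
cur.execute('CREATE EXTENSION IF NOT EXISTS "uuid-ossp"')  # provides uuid_generate_v1mc()
cur.execute("""CREATE TABLE IF NOT EXISTS fics (
    uuid uuid PRIMARY KEY,
    titre text, auteurs text, genre text, statut text,
    chapitres integer, note integer, oldid integer)""")
cur.execute("""CREATE TABLE IF NOT EXISTS chapitres (
    oldid integer, title text, datetime timestamp, auteur text, content text)""")
cur.execute("""CREATE TABLE IF NOT EXISTS commentaires (
    pseudo text, dateecr timestamp, commentaire text, idfic integer, page integer)""")
conn.commit()
conn.close()

Creating the extension needs sufficient privileges on the database; running this once before the first import is enough.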
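The collector script. It walks the 124 pages of the date-ordered ranking (/classement/date/), extracts every fic link from the listing table, and pickles the resulting list into donnees.txt for the importer above.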
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import pickle

site = "https://www.noelfic.fr/classement/date/"
url = []
for i in range(1, 125):  # listing pages 1..124
    html = urlopen(site + str(i)).read()
    print(str(i) + "/124")
    soup = BeautifulSoup(html, "lxml")
    table = soup.find('tbody')
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        t = cols[0].find('a')
        url.append(t.get('href'))  # relative fic link, e.g. "/fic/<id>-<slug>"
        # Also available in the row, unused here:
        # titre = str(t.get('title'))
        # auteur = str(cols[1].get_text()).replace("\n", "")
        # date = str(cols[2].get_text())
        # statut = str(cols[3].get_text())
        # note = len(cols[4].find_all('img'))
    print("waiting...")
    time.sleep(9.5)  # throttle requests between listing pages

with open('donnees.txt', 'wb') as fichier:
    mon_pickler = pickle.Pickler(fichier)
    mon_pickler.dump(url)
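Run the collector first to produce donnees.txt, then the importer. Both scripts throttle themselves (9.5 s per listing page, 8 s per chapter page), presumably to stay polite to the server; collecting the 124 listing pages takes roughly 20 minutes, and a full import of the 2,461 fics runs for many hours.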
"Python"