@Maxime-Favier
Last active August 30, 2019 14:03
noelfic scrapper
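Two scripts: a scraper that walks each fic's info page, chapters, and comments and inserts them into PostgreSQL, and a crawler (second file) that collects every fic URL from the site's date-ordered ranking into donnees.txt. The crawler must run first, since the scraper reads donnees.txt.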
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import re
import datetime
import psycopg2
import pickle
import sys

try:
    conn = psycopg2.connect("dbname='scrapper' user='postgres' host='localhost' password='root'")
except psycopg2.OperationalError as e:
    raise ConnectionError("could not connect to PostgreSQL") from e
cur = conn.cursor()

# Load the list of fic URLs collected by the crawler below.
with open('donnees.txt', 'rb') as fichier:
    mon_depickler = pickle.Unpickler(fichier)
    lien = mon_depickler.load()
# print(len(lien))
# lien = lien[0:2]  # uncomment to test on the first two fics only
for count, url in enumerate(lien):
    print(f"{count + 1} / {len(lien)}")  # 2461 fics in total
    html = urlopen("https://www.noelfic.fr" + str(url)).read()
    soup = BeautifulSoup(html, "lxml")

    # ############# #
    # FIRST CHAPTER #
    # ############# #
    # The fic's legacy id is embedded in its URL: /fic/<id>-<slug>
    oldId = int(re.search('fic/(.+?)-', url).group(1))

    # Rating: one star <img> per point in the right-aligned block.
    noteFinder = soup.find("div", attrs={"class": "right-align"})
    note = len(noteFinder.find_all("img"))  # fic rating

    # The chapter count only appears inside an inline script:
    # "var totalChapters = N;function handleChapterKey..."
    chapFinder = soup.find_all("script")
    chapFinder = str(chapFinder[2].get_text())
    chapitre = int(re.search('var totalChapters = (.*);function handleChapterKey', chapFinder).group(1))
    # print(chapitre)  # number of chapters in the fic

    infoFinder = soup.find("div", attrs={"class": "center-align"})
    h4 = infoFinder.find_all("h4")
    titre = h4[0].get_text()
    print(titre)  # fic title

    # The header paragraph reads "Par : ... Genre : ... Statut : ..."
    paragraphe = infoFinder.find("p").get_text()
    auteurListe = re.search('Par : (.*)Genre : ', paragraphe).group(1)  # fic authors
    genre = re.search('Genre : (.*)Statut : ', paragraphe).group(1)  # fic genre
    statut = re.search('Statut : (.*)', paragraphe).group(1)  # fic status

    AllInfo = [
        {"titre": titre, "auteurs": auteurListe, "genre": genre, "statut": statut,
         "chapitres": chapitre, "note": note, "oldId": oldId}]
    # print(AllInfo)
    cur.executemany(
        """INSERT INTO fics
            (uuid, titre, auteurs, genre, statut, chapitres, note, oldId)
        VALUES
            (uuid_generate_v1mc(), %(titre)s, %(auteurs)s, %(genre)s, %(statut)s, %(chapitres)s, %(note)s, %(oldId)s)""",
        AllInfo)
    conn.commit()
    allchapitres = []
    for i in range(1, chapitre + 1):
        # Chapter pages live at /fic/<id>-<slug>/<chapter number>
        nexturlFinder = url.split("/")
        nexturl = str(nexturlFinder[1]) + "/" + str(nexturlFinder[2]) + "/" + str(i)
        html = urlopen("https://www.noelfic.fr/" + nexturl).read()
        time.sleep(8)  # throttle requests to go easy on the server
        soup = BeautifulSoup(html, "lxml")

        infoFinder = soup.find("div", attrs={"class": "center-align"})
        h4 = infoFinder.find_all("h4")
        chapName = h4[1].get_text()
        sys.stdout.write(chapName)  # chapter number
        sys.stdout.flush()

        # Publication date, from "Publié le dd/mm/yyyy à HH:MM:SS par <author>"
        datetFinder = soup.find("p", attrs={"class": "left-align"})
        date = re.search('Publié le (.*) à ', datetFinder.get_text()).group(1)
        heure = re.search(' à (.*) par ', datetFinder.get_text()).group(1)
        datetimechap = datetime.datetime.strptime(date + " " + heure, '%d/%m/%Y %H:%M:%S')
        # print(datetimechap)  # publication time
        currentauteur = datetFinder.find("b").get_text()  # chapter author

        # Chapter body: strip the wrapping card markup and whitespace noise.
        contentFinder = soup.find("div", attrs={"class": "card grey lighten-2"})
        contentFinder = contentFinder.find("div", attrs={"class": "card-content"})
        content = (str(contentFinder)
                   .replace('<div class="card-content">', "").replace('</div>', "")
                   .replace("\n", "").replace("\r", "").replace("\t", ""))
        # print(content)  # chapter content
        allchapitres.append({"oldId": oldId, "chapitreTitle": chapName, "datetime": datetimechap,
                             "cAuteur": currentauteur, "content": content})
        # ######## #
        # COMMENTS #
        # ######## #
        commentFinder = soup.find_all("li", attrs={"class": "collection-item avatar"})
        allComment = []
        for comment in commentFinder:
            pseudo = comment.find("a").get_text()  # commenter's username
            commentget = comment.find_all("p")
            commentDate = datetime.datetime.strptime(commentget[0].get_text(), '%d/%m/%Y')
            commentContent = str(commentget[1]).replace("<p>", "").replace("</p>", "")
            allComment.append(
                {"pseudo": pseudo, "datetime": commentDate, "comment": commentContent,
                 "oldId": oldId, "page": i})
        cur.executemany(
            """INSERT INTO commentaires
                (pseudo, dateecr, commentaire, idfic, page)
            VALUES
                (%(pseudo)s, %(datetime)s, %(comment)s, %(oldId)s, %(page)s)""",
            allComment)
        conn.commit()
    sys.stdout.write('\nimporting chapters...\n')
    sys.stdout.flush()
    cur.executemany(
        """INSERT INTO chapitres
            (oldid, title, datetime, auteur, content)
        VALUES
            (%(oldId)s, %(chapitreTitle)s, %(datetime)s, %(cAuteur)s, %(content)s)""",
        allchapitres)
    conn.commit()
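The INSERT statements above assume three tables and the uuid-ossp extension (which provides uuid_generate_v1mc()). The gist does not include the schema; the following is a minimal sketch that would satisfy the queries, with all column types being assumptions:

# Minimal schema sketch satisfying the INSERTs above. Column types are
# assumptions (the gist ships no schema); uuid_generate_v1mc() requires
# the uuid-ossp extension.
import psycopg2

conn = psycopg2.connect("dbname='scrapper' user='postgres' host='localhost' password='root'")
cur = conn.cursor()
cur.execute('CREATE EXTENSION IF NOT EXISTS "uuid-ossp"')
cur.execute("""CREATE TABLE IF NOT EXISTS fics (
    uuid uuid PRIMARY KEY, titre text, auteurs text, genre text,
    statut text, chapitres integer, note integer, oldid integer)""")
cur.execute("""CREATE TABLE IF NOT EXISTS chapitres (
    oldid integer, title text, datetime timestamp, auteur text, content text)""")
cur.execute("""CREATE TABLE IF NOT EXISTS commentaires (
    pseudo text, dateecr timestamp, commentaire text, idfic integer, page integer)""")
conn.commit()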
# Crawler (run this first): walks the 124 pages of the date-ordered ranking,
# collects every fic URL, and pickles the list into donnees.txt for the
# scraper above.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import pickle

site = "https://www.noelfic.fr/classement/date/"
url = []
for i in range(1, 125):
    html = urlopen(site + str(i)).read()
    print(f"{i}/124")
    soup = BeautifulSoup(html, "lxml")
    table = soup.find('tbody')
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        t = cols[0].find('a')
        url.append(t.get('href'))  # relative link to the fic
        # Other ranking columns, available but unused here:
        # titre = str(t.get('title'))
        # auteur = str(cols[1].get_text()).replace("\n", "")
        # date = str(cols[2].get_text())
        # statut = str(cols[3].get_text())
        # note = len(cols[4].find_all('img'))
    print("waiting...")
    time.sleep(9.5)  # throttle between ranking pages

with open('donnees.txt', 'wb') as fichier:
    mon_pickler = pickle.Pickler(fichier)
    mon_pickler.dump(url)
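Before launching the scraper, it can be worth sanity-checking the crawler's output. A small sketch (the example URL shape in the comment is an assumption):

# Sanity check: load the pickled URL list and preview a few entries.
import pickle

with open('donnees.txt', 'rb') as fichier:
    lien = pickle.load(fichier)

print(len(lien), "fic URLs collected")  # the scraper expects 2461
print(lien[:3])  # e.g. ['/fic/1-some-title', ...] (assumed shape)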
Comment from @spartanz51: "Python"