Last active: August 30, 2019 14:03
noelfic scraper
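Two scripts. The first (below) reads the fic URLs pickled in donnees.txt, scrapes each fic's metadata, chapters, and comments from noelfic.fr, and inserts them into a local PostgreSQL database. The second crawls the site's date-ordered ranking pages to build that URL list.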
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import re
import datetime
import psycopg2
import pickle
import sys

try:
    conn = psycopg2.connect("dbname='scrapper' user='postgres' host='localhost' password='root'")
except psycopg2.OperationalError as e:
    raise ConnectionError("could not reach the local PostgreSQL server") from e
cur = conn.cursor()
# donnees.txt holds the list of fic URLs pickled by the collector script below
with open('donnees.txt', 'rb') as fichier:
    mon_depickler = pickle.Unpickler(fichier)
    lien = mon_depickler.load()

# lien = lien[0:2]  # uncomment to test on the first two fics only

for count, url in enumerate(lien):
    print(str(count + 1) + " / " + str(len(lien)))
    html = urlopen("https://www.noelfic.fr" + str(url)).read()
    soup = BeautifulSoup(html, "lxml")
    # ############# #
    # FIRST CHAPTER #
    # ############# #
    # Numeric id of the fic, taken from its URL ("/fic/<id>-<slug>")
    oldId = int(re.search('fic/(.+?)-', url).group(1))
    # Fic rating: one star image per point
    noteFinder = soup.find("div", attrs={"class": "right-align"})
    note = len(noteFinder.find_all("img"))
    # Chapter count, read out of an inline <script> variable
    chapFinder = soup.find_all("script")
    chapFinder = str(chapFinder[2].get_text())
    chapitre = int(re.search('var totalChapters = (.*);function handleChapterKey', chapFinder).group(1))
    infoFinder = soup.find("div", attrs={"class": "center-align"})
    h4 = infoFinder.find_all("h4")
    titre = h4[0].get_text()
    print(titre)  # fic title
    paragraphe = infoFinder.find("p").get_text()
    auteurListe = re.search('Par : (.*)Genre : ', paragraphe).group(1)  # fic authors
    genre = re.search('Genre : (.*)Statut : ', paragraphe).group(1)  # fic genre
    statut = re.search('Statut : (.*)', paragraphe).group(1)  # fic status
    AllInfo = [{"titre": titre, "auteurs": auteurListe, "genre": genre, "statut": statut,
                "chapitres": chapitre, "note": note, "oldId": oldId}]
    cur.executemany(
        """INSERT INTO fics
           (uuid, titre, auteurs, genre, statut, chapitres, note, oldId)
           VALUES
           (uuid_generate_v1mc(), %(titre)s, %(auteurs)s, %(genre)s, %(statut)s, %(chapitres)s, %(note)s, %(oldId)s)""",
        AllInfo)
    conn.commit()
    # Walk every chapter page of the fic
    allchapitres = []
    for i in range(1, chapitre + 1):
        nexturlFinder = url.split("/")
        nexturl = nexturlFinder[1] + "/" + nexturlFinder[2] + "/" + str(i)
        html = urlopen("https://www.noelfic.fr/" + nexturl).read()
        time.sleep(8)  # throttle requests between chapter pages
        soup = BeautifulSoup(html, "lxml")
        infoFinder = soup.find("div", attrs={"class": "center-align"})
        h4 = infoFinder.find_all("h4")
        chapName = h4[1].get_text()
        sys.stdout.write(chapName)  # chapter number
        sys.stdout.flush()
        # Publication date and time: "Publié le <date> à <heure> par <auteur>"
        datetFinder = soup.find("p", attrs={"class": "left-align"})
        date = re.search('Publié le (.*) à ', datetFinder.get_text()).group(1)
        heure = re.search(' à (.*) par ', datetFinder.get_text()).group(1)
        datetimechap = datetime.datetime.strptime(date + " " + heure, '%d/%m/%Y %H:%M:%S')
        currentauteur = datetFinder.find("b").get_text()  # chapter author
        # Chapter body: keep the inner HTML, minus the wrapper div and control characters
        contentFinder = soup.find("div", attrs={"class": "card grey lighten-2"})
        contentFinder = contentFinder.find("div", attrs={"class": "card-content"})
        content = (str(contentFinder)
                   .replace('<div class="card-content">', "")
                   .replace('</div>', "")
                   .replace("\n", "")
                   .replace("\r", "")
                   .replace("\t", ""))
        allchapitres.append({"oldId": oldId, "chapitreTitle": chapName, "datetime": datetimechap,
                             "cAuteur": currentauteur, "content": content})
        # ######## #
        # COMMENTS #
        # ######## #
        commentFinder = soup.find_all("li", attrs={"class": "collection-item avatar"})
        allComment = []
        for comment in commentFinder:
            pseudo = comment.find("a").get_text()
            commentget = comment.find_all("p")
            commentDate = datetime.datetime.strptime(commentget[0].get_text(), '%d/%m/%Y')
            commentContent = str(commentget[1]).replace("<p>", "").replace("</p>", "")
            allComment.append({"pseudo": pseudo, "datetime": commentDate, "comment": commentContent,
                               "oldId": oldId, "page": i})
        cur.executemany(
            """INSERT INTO commentaires
               (pseudo, dateecr, commentaire, idfic, page)
               VALUES
               (%(pseudo)s, %(datetime)s, %(comment)s, %(oldId)s, %(page)s)""",
            allComment)
        conn.commit()
    sys.stdout.write('\n import..\n')
    sys.stdout.flush()
    cur.executemany(
        """INSERT INTO chapitres
           (oldid, title, datetime, auteur, content)
           VALUES
           (%(oldId)s, %(chapitreTitle)s, %(datetime)s, %(cAuteur)s, %(content)s)""",
        allchapitres)
    conn.commit()
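The INSERT statements assume that the fics, chapitres, and commentaires tables, plus the uuid-ossp extension (which provides uuid_generate_v1mc()), already exist. A minimal one-off setup sketch: the table and column names come from the INSERTs above, while the column types are assumptions inferred from the values the scraper inserts.

# Hypothetical setup script; names are taken from the INSERTs above, types are assumptions.
import psycopg2

conn = psycopg2.connect("dbname='scrapper' user='postgres' host='localhost' password='root'")
cur = conn.cursor()
cur.execute('CREATE EXTENSION IF NOT EXISTS "uuid-ossp"')  # provides uuid_generate_v1mc()
cur.execute("""CREATE TABLE IF NOT EXISTS fics (
    uuid uuid PRIMARY KEY,
    titre text, auteurs text, genre text, statut text,
    chapitres integer, note integer, oldid integer)""")
cur.execute("""CREATE TABLE IF NOT EXISTS chapitres (
    oldid integer, title text, datetime timestamp, auteur text, content text)""")
cur.execute("""CREATE TABLE IF NOT EXISTS commentaires (
    pseudo text, dateecr timestamp, commentaire text, idfic integer, page integer)""")
conn.commit()
conn.close()

Creating the extension needs sufficient privileges on the database; running this once before the first import is enough.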
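The collector script. It walks the 124 pages of the date-ordered ranking (/classement/date/), extracts every fic link from the listing table, and pickles the resulting list into donnees.txt for the importer above.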
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import pickle

site = "https://www.noelfic.fr/classement/date/"
url = []
for i in range(1, 125):  # listing pages 1..124
    html = urlopen(site + str(i)).read()
    print(str(i) + "/124")
    soup = BeautifulSoup(html, "lxml")
    table = soup.find('tbody')
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        t = cols[0].find('a')
        url.append(t.get('href'))  # relative fic link, e.g. "/fic/<id>-<slug>"
        # Also available in the row, unused here:
        # titre = str(t.get('title'))
        # auteur = str(cols[1].get_text()).replace("\n", "")
        # date = str(cols[2].get_text())
        # statut = str(cols[3].get_text())
        # note = len(cols[4].find_all('img'))
    print("waiting...")
    time.sleep(9.5)  # throttle requests between listing pages

with open('donnees.txt', 'wb') as fichier:
    mon_pickler = pickle.Pickler(fichier)
    mon_pickler.dump(url)
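Run the collector first to produce donnees.txt, then the importer. Both scripts throttle themselves (9.5 s per listing page, 8 s per chapter page), presumably to stay polite to the server; collecting the 124 listing pages takes roughly 20 minutes, and a full import of the 2,461 fics runs for many hours.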
"Python"