celsowm · January 31, 2025 22:00
diff --git a/codigo_civil_scrapping.py b/codigo_civil_scrapping.py
 import requests
 from bs4 import BeautifulSoup
 import re
 import json

 # Script: download the compiled law page, split it into articles and save
 # them as JSON. Every <p> that starts with "Art. <number>" opens a new
 # article; following non-centered paragraphs are appended to its text.

 # URL of the page that contains the articles
 url = "https://www.planalto.gov.br/ccivil_03/leis/2002/l10406compilada.htm"

 # Headers that mimic a regular browser
 headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/112.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
 }

 try:
    # Without a timeout requests may block forever on an unresponsive server;
    # a timeout error is a RequestException and is handled below.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # Raises for 4xx/5xx responses
 except requests.exceptions.RequestException as e:
    print(f"Erro ao acessar a página: {e}")
    # Exit with a non-zero status so callers can tell the run failed.
    # SystemExit is used directly because the exit() helper is only
    # installed by the site module and reports success (0) by default.
    raise SystemExit(1)

 # Encoding adjustment, if necessary
 detected_encoding = response.encoding
 print(f"Codificação detectada inicialmente: {detected_encoding}")

 # response.encoding may be None when the HTTP headers carry no charset, so
 # guard before calling .lower(). For an unknown or missing encoding, let
 # BeautifulSoup sniff it from the raw bytes.
 known_encodings = ('utf-8', 'iso-8859-1', 'windows-1252')
 if detected_encoding is None or detected_encoding.lower() not in known_encodings:
    soup_detect = BeautifulSoup(response.content, 'html.parser')
    detected_encoding = soup_detect.original_encoding
    print(f"Codificação detectada pelo BeautifulSoup: {detected_encoding}")
    response.encoding = detected_encoding

 # Parse the HTML content using the chosen encoding
 soup = BeautifulSoup(response.text, 'html.parser')
 paragrafos = soup.find_all('p')

 artigos = []
 artigo_atual = {}

 # Matches paragraphs that start with "Art." followed by the article number
 # (optionally preceded by an "o"); dots inside the number are thousands
 # separators, e.g. "Art. 1.240".
 # NOTE(review): a suffix such as "-A" after the number is not captured, so
 # suffixed articles (if the page has any) share the plain article's number.
 regex_art = re.compile(r'^Art\. ?(?:o\s*)?(\d+(?:\.\d+)*)', re.IGNORECASE)

 for p in paragrafos:
    # Skip centered paragraphs (chapter titles)
    if p.get('align', '').lower() == 'center':
        continue

    # Collapse whitespace runs once, before matching, so line breaks inside
    # the HTML text neither hide an article header nor leak into the output.
    texto = re.sub(r'\s+', ' ', p.get_text(separator=' ', strip=True))
    match = regex_art.match(texto)

    if match:
        # A new header closes the article currently being collected
        if artigo_atual:
            artigos.append(artigo_atual)

        # The captured group holds only digits and dots, so int() cannot fail
        numero_int = int(match.group(1).replace(".", ""))

        # Drop the header and the punctuation that separates it from the text
        texto_sem_header = texto[match.end():].lstrip(" .:-")
        # Remove a leading lowercase "o " (the ordinal marker left after the
        # number); an uppercase "O " is kept because it is real article text.
        if texto_sem_header.startswith("o "):
            texto_sem_header = texto_sem_header[2:].lstrip()

        artigo_atual = {
            "numero": numero_int,
            "texto": texto_sem_header
        }
    elif artigo_atual and texto:
        # Paragraph without an article header: append it to the current
        # article. Empty paragraphs are skipped to avoid stray spaces.
        artigo_atual["texto"] += " " + texto

 # Add the last collected article, if any
 if artigo_atual:
    artigos.append(artigo_atual)

 # Print the collected articles (optional)
 for artigo in artigos:
    print(f"Art. {artigo['numero']}: {artigo['texto']}\n")

 # Save the articles to a JSON file
 try:
    with open("artigos.json", "w", encoding="utf-8") as f:
        json.dump(artigos, f, ensure_ascii=False, indent=4)
    print("Artigos salvos com sucesso no arquivo 'artigos.json'.")
 except IOError as e:
    print(f"Erro ao salvar o arquivo JSON: {e}")
	import requests
	from bs4 import BeautifulSoup
	import re
	import json

	# Script: download the compiled law page, split it into articles and save
	# them as JSON. Every <p> that starts with "Art. <number>" opens a new
	# article; following non-centered paragraphs are appended to its text.

	# URL of the page that contains the articles
	url = "https://www.planalto.gov.br/ccivil_03/leis/2002/l10406compilada.htm"

	# Headers that mimic a regular browser
	headers = {
	    "User-Agent": (
	        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	        "AppleWebKit/537.36 (KHTML, like Gecko) "
	        "Chrome/112.0.0.0 Safari/537.36"
	    ),
	    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
	    "Accept": (
	        "text/html,application/xhtml+xml,application/xml;"
	        # "*/*" is the catch-all media range; a bare "/" is not a valid one
	        "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
	    ),
	}

	try:
	    # Without a timeout requests may block forever on an unresponsive
	    # server; a timeout error is a RequestException and is handled below.
	    response = requests.get(url, headers=headers, timeout=30)
	    response.raise_for_status()  # Raises for 4xx/5xx responses
	except requests.exceptions.RequestException as e:
	    print(f"Erro ao acessar a página: {e}")
	    # Exit with a non-zero status so callers can tell the run failed.
	    # SystemExit is used directly because the exit() helper is only
	    # installed by the site module and reports success (0) by default.
	    raise SystemExit(1)

	# Encoding adjustment, if necessary
	detected_encoding = response.encoding
	print(f"Codificação detectada inicialmente: {detected_encoding}")

	# response.encoding may be None when the HTTP headers carry no charset, so
	# guard before calling .lower(). For an unknown or missing encoding, let
	# BeautifulSoup sniff it from the raw bytes.
	known_encodings = ('utf-8', 'iso-8859-1', 'windows-1252')
	if detected_encoding is None or detected_encoding.lower() not in known_encodings:
	    soup_detect = BeautifulSoup(response.content, 'html.parser')
	    detected_encoding = soup_detect.original_encoding
	    print(f"Codificação detectada pelo BeautifulSoup: {detected_encoding}")
	    response.encoding = detected_encoding

	# Parse the HTML content using the chosen encoding
	soup = BeautifulSoup(response.text, 'html.parser')
	paragrafos = soup.find_all('p')

	artigos = []
	artigo_atual = {}

	# Matches paragraphs that start with "Art." followed by the article number
	# (optionally preceded by an "o"). The "*" quantifiers matter: "\s*" lets
	# the "o" be followed by any amount of whitespace, and "(?:\.\d+)*" makes
	# the dotted thousands groups optional so both "Art. 5" and "Art. 1.240"
	# match.
	# NOTE(review): a suffix such as "-A" after the number is not captured, so
	# suffixed articles (if the page has any) share the plain article's number.
	regex_art = re.compile(r'^Art\. ?(?:o\s*)?(\d+(?:\.\d+)*)', re.IGNORECASE)

	for p in paragrafos:
	    # Skip centered paragraphs (chapter titles)
	    if p.get('align', '').lower() == 'center':
	        continue

	    # Collapse whitespace runs once, before matching, so line breaks inside
	    # the HTML text neither hide an article header nor leak into the output.
	    texto = re.sub(r'\s+', ' ', p.get_text(separator=' ', strip=True))
	    match = regex_art.match(texto)

	    if match:
	        # A new header closes the article currently being collected
	        if artigo_atual:
	            artigos.append(artigo_atual)

	        # The captured group holds only digits and dots, so int() cannot fail
	        numero_int = int(match.group(1).replace(".", ""))

	        # Drop the header and the punctuation that separates it from the text
	        texto_sem_header = texto[match.end():].lstrip(" .:-")
	        # Remove a leading lowercase "o " (the ordinal marker left after the
	        # number); an uppercase "O " is kept because it is real article text.
	        if texto_sem_header.startswith("o "):
	            texto_sem_header = texto_sem_header[2:].lstrip()

	        artigo_atual = {
	            "numero": numero_int,
	            "texto": texto_sem_header
	        }
	    elif artigo_atual and texto:
	        # Paragraph without an article header: append it to the current
	        # article. Empty paragraphs are skipped to avoid stray spaces.
	        artigo_atual["texto"] += " " + texto

	# Add the last collected article, if any
	if artigo_atual:
	    artigos.append(artigo_atual)

	# Print the collected articles (optional)
	for artigo in artigos:
	    print(f"Art. {artigo['numero']}: {artigo['texto']}\n")

	# Save the articles to a JSON file
	try:
	    with open("artigos.json", "w", encoding="utf-8") as f:
	        json.dump(artigos, f, ensure_ascii=False, indent=4)
	    print("Artigos salvos com sucesso no arquivo 'artigos.json'.")
	except IOError as e:
	    print(f"Erro ao salvar o arquivo JSON: {e}")