Skip to content

Instantly share code, notes, and snippets.

@Xnuvers007
Created October 12, 2024 09:57
Show Gist options
  • Save Xnuvers007/78984e4aa8c37e19ab1c91540caf0edf to your computer and use it in GitHub Desktop.
Save Xnuvers007/78984e4aa8c37e19ab1c91540caf0edf to your computer and use it in GitHub Desktop.
import requests
from bs4 import BeautifulSoup
# Cookies sent with the search request (passed to requests.get via cookies=).
# NOTE(review): these look like values captured from a browser session
# (PHPSESSID is presumably a PHP session id) — confirm whether the search
# endpoint needs them at all, and avoid committing live session identifiers.
cookies = {
    'wordpress_test_cookie': 'WP+Cookie+check',
    'wp-settings-1': 'edit_element_vcUIPanelWidth%3D1021%26ampampampampedit_element_vcUIPanelLeft%3D14px%26ampampampampedit_element_vcUIPanelTop%3D59px%26ampampampampeditor%3Dtinymce%26ampampampamplibraryContent%3Dbrowse%26ampampampampimgsize%3Dfull%26ampamplibraryContent%3Dbrowse%26ampampmfold%3Do%26ampampedit_element_vcUIPanelLeft%3D67px%26ampampedit_element_vcUIPanelTop%3D136px%26ampampposts_list_mode%3Dlist%26uploader%3D1%26editor%3Dtinymce%26edit_element_vcUIPanelLeft%3D230px%26edit_element_vcUIPanelTop%3D137px%26libraryContent%3Dbrowse%26template_window_vcUIPanelWidth%3D1212%26template_window_vcUIPanelLeft%3D142px%26template_window_vcUIPanelTop%3D74px%26mfold%3Do',
    'wp-settings-time-1': '1719133983',
    'PHPSESSID': 'cc28vdiqhvlek3jv49hib7smd2',
    'sc_is_visitor_unique': 'rx12856607.1728455295.7B514FD2EE304FEE97F9BC36B4D3A245.2.2.2.2.2.2.2.2.1',
    'vCentminmod': '8735cc5bae5801c849211396cc63cd91',
}
# Request headers imitating a desktop Microsoft Edge 129 browser navigating
# from the repository's advanced-search page.
# No 'Cookie' header is set here: cookies are passed separately through the
# cookies= argument of the request.
headers = {
    # Content negotiation
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'id,en;q=0.9,en-GB;q=0.8,en-US;q=0.7,sv;q=0.6',
    'Connection': 'keep-alive',
    # Page the search form was submitted from
    'Referer': 'https://repository.unpam.ac.id/cgi/search/advanced',
    # Fetch-metadata headers describing a same-origin, user-initiated navigation
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    # Browser identification (User-Agent string plus client hints)
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0',
    'sec-ch-ua': '"Microsoft Edge";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
# Query-string parameters for the advanced-search form. Only the title
# criterion carries a search term; every other free-text field is left empty.
# Keyword order is preserved, so the generated query string keeps the same
# field order as before.
params = dict(
    screen='Search',
    dataset='archive',
    documents_merge='ALL',
    documents='',
    title_merge='ALL',
    title='Rekayasa perangkat lunak',  # the search term
    creators_name_merge='ALL',
    creators_name='',
    abstract_merge='ALL',
    abstract='',
    date='',
    keywords_merge='ALL',
    keywords='',
    subjects_merge='ANY',
    department_merge='ALL',
    department='',
    editors_name_merge='ALL',
    editors_name='',
    refereed='EITHER',
    publication_merge='ALL',
    publication='',
    satisfyall='ALL',
    order='-date/creators_name/title',
    _action_search='Search',
)
# Submit the advanced search and fetch the HTML result page.
# NOTE(review): verify=False turns off TLS certificate verification, so the
# session cookies are sent to a peer whose identity is not checked (and
# urllib3 emits an InsecureRequestWarning). It is kept only to preserve the
# existing behaviour — presumably the server's certificate chain does not
# validate; confirm, and remove it if the certificate is actually fine.
response = requests.get(
    'https://repository.unpam.ac.id/cgi/search/archive/advanced',
    params=params,
    cookies=cookies,
    headers=headers,
    verify=False,
    # requests applies no timeout by default; without one a stalled server
    # would block the script indefinitely.
    timeout=30,
)
# Fail with a clear HTTPError on 4xx/5xx instead of parsing an error page.
response.raise_for_status()
# Parse the result page and print authors, title, record link and document
# links for every search hit.
soup = BeautifulSoup(response.text, 'html.parser')

# find() returns None when the results container is absent (presumably when
# the search matches nothing), so fall back to an empty list instead of
# calling find_all() on None.
div = soup.find('div', attrs={'class': 'ep_search_results'})
results = (
    div.find_all('tr', class_='ep_search_result')
    if div is not None
    else []
)
if not results:
    print("No results found.")

for result in results:
    # The first anchor in the row is taken as the title link; a row without
    # any anchor has nothing to report, so skip it.
    title_tag = result.find('a')
    if title_tag is None:
        continue

    # Authors: every <span class="person_name"> inside the row.
    authors_list = [
        author.get_text(strip=True)
        for author in result.find_all('span', class_='person_name')
    ]

    title = title_tag.get_text(strip=True)
    link = title_tag.get('href', '')

    # Document links sit in the centered cell, which is optional. Only
    # anchors that actually carry an href are collected.
    publication_info = result.find('td', align='center')
    document_links = (
        [anchor['href'] for anchor in publication_info.find_all('a', href=True)]
        if publication_info is not None
        else []
    )

    print("Authors:", ", ".join(authors_list))
    print("Title:", title)
    print("Link:", link)
    print("Document Links:", ", ".join(document_links))
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment