Last active
September 18, 2019 01:24
-
-
Save Generalelektrix/52c2d09fc82c99dd96e4a71242ad5433 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
use strict; | |
use LWP::Simple; | |
use URI::URL; | |
=head1 Synopsis

This script produces a file of RedirectSafe directives for the domains found in the publicly available DOAJ csv file.
Its main concern is to avoid duplicating any domain that is already used in your own EZproxy config files.
The new config file produced is called config_doaj.txt and can be included from your own EZproxy main config file
this way:

    IncludeFile config_doaj.txt

=cut

=head2 Author

Sébastien Nadeau
Université Laval

=cut

# URL where DOAJ csv file is found | |
my $url_doaj_csv = qq[https://doaj.org/csv]; | |
# Will we fetch the DOAJ csv file? | |
my $go_get_it_again = 1; | |
# Temporary DOAJ csv file | |
my $csv_doaj_filename = qq[csv_doaj.txt]; | |
# New DOAJ config file | |
my $cfg_doaj_filename = qq[config_doaj.txt]; | |
# Logfile | |
my $log_doaj_filename = qq[config_doaj.log]; | |
# List of actual EZproxy config files | |
# Add more files to this list if necessary | |
my @ezproxy_cfg_files = ( | |
"config.txt", | |
); | |
# EZproxy base directory (where EZproxy config file is found) | |
my $ezproxy_dir = qq[/usr/local/ezproxy]; | |
open my $logfile, qq[>$ezproxy_dir/$log_doaj_filename] or die $!; | |
if ($go_get_it_again) { | |
my $response_code = getstore($url_doaj_csv, qq[$ezproxy_dir/$csv_doaj_filename]); | |
if (is_error($response_code)) { | |
print $logfile "Response code: $response_code -- Aborting.\n"; | |
die; | |
} | |
else { | |
print $logfile "Fetched $url_doaj_csv sucessfully.\n"; | |
} | |
print $logfile "\n"; | |
} | |
else { | |
print $logfile "Won't fetch $url_doaj_csv again.\n"; | |
print $logfile "\n"; | |
} | |
# Opening DOAJ csv file and reduction to a hashmap of domains | |
my %doaj_domains; | |
if (open my $csv, qq[<$ezproxy_dir/$csv_doaj_filename]) { | |
while (my $line = <$csv>) { | |
chomp $line; | |
my $url = new URI::URL ((split /,/, $line)[1]); | |
if ($url->can("host") and $url->can("port")) { | |
my $domain; | |
if ($url->port != 80 and $url->port != 443) { | |
$domain = $url->host . ":" . $url->port; | |
} | |
else { | |
$domain = $url->host; | |
} | |
if (defined($domain) and $domain) { | |
$doaj_domains{$domain}++; | |
} | |
} | |
} | |
my $number_of_domains = scalar keys %doaj_domains; | |
print $logfile "DOAJ list contains $number_of_domains unique domains.\n"; | |
close $csv; | |
} | |
else { | |
print $logfile "$! -- Aborting.\n"; | |
die $!; | |
} | |
# If DOAJ domains are found in EZproxy config, remove them from hashmap | |
# There are 2 possibilities: | |
# - domain is already in a RedirectSafe directive | |
# - domain is found in a database stanza | |
my $ct = 0; | |
my $ct_db = 0; | |
my $ct_rs = 0; | |
foreach my $ezproxy_cfg_filename (@ezproxy_cfg_files) { | |
if (open my $cfg, qq[<$ezproxy_dir/$ezproxy_cfg_filename]) { | |
while (my $line = <$cfg>) { | |
chomp $line; | |
$line =~s/\r\n//g; | |
if ($line=~/^redirectsafe\s+(.*)/i) { | |
my $domain = lc($1); | |
$domain =~s/\/.*$//; | |
if (defined $doaj_domains{$domain}) { | |
delete $doaj_domains{$domain}; | |
print $logfile "$domain already in redirectsafe directive of config, removed from DOAJ list\n"; | |
$ct_rs++; | |
} | |
} | |
elsif ($line=~/^(U|URL|H|HJ|Host|HostJavascript|D|DJ|Domain|DomainJavascript)\s+(.*)/i) { | |
$ct++; | |
my $value = lc($2); | |
my $domain; | |
if ($value =~/^\w+:\/\//) { | |
my $url = new URI::URL ($value); | |
$domain = $url->host; | |
} | |
else { | |
$domain = $value; | |
$domain =~s/\/.*$//; | |
} | |
if ($domain) { | |
foreach my $doaj_domain (keys %doaj_domains) { | |
if ($domain =~/(\A|\.)$doaj_domain$/) { | |
delete $doaj_domains{$doaj_domain}; | |
$ct_db++; | |
print $logfile "$domain <==> $doaj_domain already in database stanza, removed from DOAJ list\n"; | |
} | |
} | |
} | |
} | |
} | |
close $cfg; | |
} | |
else { | |
print $logfile "Unable to open $ezproxy_cfg_filename: $!\n"; | |
} | |
} | |
print $logfile "$ct_rs domains removed from DOAJ list because already in RedirectSafe\n"; | |
print $logfile "$ct_db domains removed from DOAJ list because already in Database Stanza\n"; | |
my $number_of_domains = scalar keys %doaj_domains; | |
print $logfile "DOAJ list now contains $number_of_domains unique domains.\n"; | |
# Creating new config file with DOAJ domains in RedirectSafe directives | |
if (open my $doaj_cfg, qq[>$ezproxy_dir/$cfg_doaj_filename]) { | |
foreach my $domain (sort keys %doaj_domains) { | |
print $doaj_cfg qq[RedirectSafe $domain\n]; | |
} | |
print $logfile "$ezproxy_dir/$cfg_doaj_filename created successfully.\n"; | |
close $doaj_cfg; | |
} | |
else { | |
print $logfile "$! -- Aborting.\n"; | |
die $!; | |
} | |
close $logfile; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment