#!/usr/bin/env bash
#
# Scrape the names of Fakku authors who have works under a given tag,
# or list every available tag. Requires curl.

set -euo pipefail

# URL template for one page of a tag listing. The @tag and @page_num
# placeholders are filled in before each request.
readonly BASE_URL_TPL='https://www.fakku.net/tags/@tag/page/@page_num'
# Page that lists all available tags.
readonly TAGS_URL='https://www.fakku.net/tags'
# User-Agent header value sent with every request.
readonly USER_AGENT='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0'

# Help text shown when the script is called with the wrong arguments.
# The assignment is kept separate from `readonly` so that a failing command
# substitution is not masked by the exit status of `readonly` itself.
SCRIPT_USAGE="$(cat <<EOF
Usage: $0 (--print-all-tags | TAG)

Scrape a list of Fakku authors who have works tagged with TAG.
Pipe STDOUT to sort -u to deduplicate the results (if needed).

If --print-all-tags is specified, print a list of all available
tags in 'TAG_PRETTY_NAME: TAG' format. Do not use pretty names
as arguments to this script as that will not work.

Example:
# Print a list of available tags to STDOUT
$0 --print-all-tags

# Scrape all author names who have works tagged with 'ecchi'
# tag. Deduplicate and save to ./fakku_ecchi_authors.txt file
# in the current directory.
$0 ecchi | sort -u | tee ./fakku_ecchi_authors.txt
EOF
)"
readonly SCRIPT_USAGE
|
|
|
|
|
#######################################
# Fetch a URL with curl and write the response body to stdout.
# Globals:   USER_AGENT (read)
# Arguments: $1 - URL to fetch
# Outputs:   response body on stdout; curl's own error text on stderr
# Returns:   curl's exit status (22 for HTTP errors, because of --fail)
#######################################
_main_fetch() {
  curl --fail --show-error --silent --header "User-Agent: ${USER_AGENT}" "$1"
}

#######################################
# Extract author names from a tag listing page read on stdin.
# Takes each line matching href="/artists/ plus the line after it, keeps
# the ones containing </a>, cuts everything from the first '<' onward and
# lets awk re-join the fields, which trims and squeezes whitespace.
# Outputs:   one author name per line on stdout
# Returns:   pipeline status; non-zero under pipefail when nothing matches
#######################################
_main_extract_authors() {
  grep -A 1 'href="/artists/' \
    | grep '</a>' \
    | cut -f1 -d'<' \
    | awk '{$1=$1};1'
}

#######################################
# Entry point: print all tags, or scrape author names for one tag.
# Globals:   SCRIPT_USAGE, TAGS_URL, BASE_URL_TPL (read)
# Arguments: $1 - either --print-all-tags or a tag name
# Outputs:   results on stdout; progress and error messages on stderr
# Returns:   0 on success; 1 on usage errors, missing curl, or a tag whose
#            first page lists no authors; curl's exit status when a
#            request fails
#######################################
main() {
  if (( $# != 1 )); then
    printf '%s\n' "${SCRIPT_USAGE}" >&2
    return 1
  fi

  if ! command -v curl &> /dev/null; then
    printf '%s\n' 'Error: Failed to locate curl command' >&2
    return 1
  fi

  local tag="$1"

  if [[ "${tag}" == '--print-all-tags' ]]; then
    printf '%s\n' '---(i) INFO: Scraping a list of all Fakku tags' >&2

    # Turn each tag link into a "PRETTY NAME: tag" line.
    _main_fetch "${TAGS_URL}" \
      | sed -ne 's@^.*href="/tags/\(..*\)">\(..*\)</.*$@\2: \1@p' \
      | sort -u
    return
  fi

  # An empty or option-like argument is a usage mistake (e.g. a mistyped
  # flag); do not send it to the server as a tag name.
  if [[ -z "${tag}" || "${tag}" == -* ]]; then
    printf '%s\n' "${SCRIPT_USAGE}" >&2
    return 1
  fi

  # Fill the placeholders by splitting the template around them instead of
  # using ${var/pat/rep}: a quoted replacement keeps its quotes literally on
  # bash <= 4.2, and an unquoted one treats '&' specially on bash >= 5.2.
  local tag_url_tpl
  tag_url_tpl="${BASE_URL_TPL%%@tag*}${tag}${BASE_URL_TPL#*@tag}"

  local page_num=1
  local url page_html authors rc

  while true; do
    printf '%s\n' \
      "---(i) INFO: Scraping Fakku author names: tag=\"${tag}\", page=${page_num}" >&2

    url="${tag_url_tpl%%@page_num*}${page_num}${tag_url_tpl#*@page_num}"

    rc=0
    page_html="$(_main_fetch "${url}")" || rc=$?
    if (( rc != 0 )); then
      # curl --fail exits with 22 on an HTTP error status. After the first
      # page that is taken to mean we ran past the last page of the listing
      # (assumption about the site -- confirm against real responses).
      if (( page_num > 1 && rc == 22 )); then
        printf '%s\n' \
          "---(i) INFO: Page ${page_num} returned an HTTP error; assuming end of results" >&2
        break
      fi
      printf '%s\n' "Error: Failed to fetch ${url} (curl exit status ${rc})" >&2
      return "${rc}"
    fi

    # A page without matches makes grep fail; that is an expected outcome
    # here, so only the (possibly empty) output is inspected.
    authors="$(_main_extract_authors <<<"${page_html}")" || true
    if [[ -z "${authors}" ]]; then
      if (( page_num == 1 )); then
        printf '%s\n' "Error: No authors found for tag \"${tag}\"" >&2
        return 1
      fi
      printf '%s\n' "---(i) INFO: No authors on page ${page_num}; done" >&2
      break
    fi

    printf '%s\n' "${authors}"
    page_num=$((page_num + 1))
  done

  return 0
}
|
|
|
# Run the entry point with the script's command-line arguments; its status
# becomes the script's exit status.
main "$@"