Skip to content

Instantly share code, notes, and snippets.

@torson
Created August 18, 2020 15:20
Show Gist options
  • Save torson/66f5d74920061636b19f39e0116f51ff to your computer and use it in GitHub Desktop.
Save torson/66f5d74920061636b19f39e0116f51ff to your computer and use it in GitHub Desktop.
nepremicnine_scraper.sh
#!/bin/bash
#
# Watch a nepremicnine.net rental search and report listings not seen before.
#
# This first part holds the configuration, downloads the first result page of
# the search and splits the links found on it into further result pages
# (LIST_OF_PAGES) and individual flats (LIST_OF_FLATS).
#
# Requires curl and a grep with -P (PCRE) support.
#
# Note: `set -e` / `pipefail` are deliberately not enabled; grep exits with
# status 1 when nothing matches, which is a normal outcome for these filters.

# Site root; the link paths scraped from the pages are appended to it.
readonly DOMAIN="https://www.nepremicnine.net"
# Path and query string of the search whose results are watched.
readonly MAIN_URL="/oglasi-oddaja/ljubljana-mesto/stanovanje/garsonjera,1-sobno,1.5-sobno,2-sobno/cena-od-450-do-700-eur-na-mesec,velikost-od-38-do-60-m2/?s=14&nadst%5B0%5D=vsa&nadst%5B1%5D=vsa"
# Flat links already reported; kept between runs so each flat is shown once.
readonly INDEX_FILE="/tmp/index.txt"
# Download targets for the first result page and for each following page.
readonly FIRST_PAGE="/tmp/page1.html"
readonly TMP_FILE="/tmp/tmp.html"
# Pause between two HTTP requests.
readonly SLEEP_BETWEEN_REQUESTS_SECONDS=10

# Make sure the index exists so that it can be searched on the very first run.
touch "${INDEX_FILE}"

# -q (must come first) ignores any curlrc, -f turns HTTP errors into a
# non-zero exit status, -sS hides the progress meter but keeps error messages.
# Stop on failure rather than parsing a page left over from an earlier run.
if ! curl -q -fsS -o "${FIRST_PAGE}" "${DOMAIN}${MAIN_URL}"; then
  printf 'error: could not download %s%s\n' "${DOMAIN}" "${MAIN_URL}" >&2
  exit 1
fi

# Links to further result pages: they end in "=vsa" and contain a purely
# numeric path segment. One or more digits are accepted so that page 10 and
# beyond are followed as well.
LIST_OF_PAGES=$(
  grep -o -P '\/oglasi-oddaja\/[^"]+' "${FIRST_PAGE}" \
    | grep -P '=vsa$' \
    | grep -P '/[0-9]+/' \
    | sort -u
)

# Remaining links: everything under /oglasi-oddaja/ that is not part of the
# search itself (/ljubljana-mesto/), not a "=vsa" or numbered page link and
# not the bare "/oglasi-oddaja/?" form.
LIST_OF_FLATS=$(
  grep -o -P '\/oglasi-oddaja\/[^"]+' "${FIRST_PAGE}" \
    | grep -v -F '/ljubljana-mesto/' \
    | grep -v -P '=vsa$' \
    | grep -v -P '/[0-9]/' \
    | grep -v -F '/oglasi-oddaja/?' \
    | sort -u
)
#######################################
# Print the flat links contained in a downloaded result page.
# Keeps links under /oglasi-oddaja/ that are not below /ljubljana-mesto/, do
# not end in "=vsa", have no single-digit path segment and are not the bare
# "/oglasi-oddaja/?" form.
# Arguments: $1 - path of the saved HTML page
# Outputs:   unique, sorted link paths on stdout, one per line
#######################################
extract_flat_links() {
  grep -o -P '\/oglasi-oddaja\/[^"]+' -- "$1" \
    | grep -v -F '/ljubljana-mesto/' \
    | grep -v -P '=vsa$' \
    | grep -v -P '/[0-9]/' \
    | grep -v -F '/oglasi-oddaja/?' \
    | sort -u
}

#######################################
# Report every flat link that is not yet listed in the index and record it.
# Globals:   DOMAIN (read), INDEX_FILE (read and appended to)
# Inputs:    flat link paths on stdin, one per line; empty lines are ignored
# Outputs:   "new flat! <full url>" on stdout for each unseen link
#######################################
report_new_flats() {
  local flat
  while IFS= read -r flat; do
    [[ -n "${flat}" ]] || continue
    # -x -F: compare against whole index lines literally, so characters such
    # as "." or "?" in a link are not interpreted as regex operators.
    if ! grep -q -x -F -- "${flat}" "${INDEX_FILE}"; then
      printf 'new flat! %s%s\n' "${DOMAIN}" "${flat}"
      printf '%s\n' "${flat}" >> "${INDEX_FILE}"
    fi
  done
}

# Flats found on the first result page.
report_new_flats <<< "${LIST_OF_FLATS}"
sleep "${SLEEP_BETWEEN_REQUESTS_SECONDS}"

# Walk the remaining result pages; read them into an array so the links are
# neither word-split nor glob-expanded.
mapfile -t PAGE_URLS <<< "${LIST_OF_PAGES}"
for URL in "${PAGE_URLS[@]}"; do
  [[ -n "${URL}" ]] || continue
  # -f makes HTTP errors fail; on failure skip parsing so that the content of
  # a previously downloaded page is not processed a second time.
  if curl -q -fsS -o "${TMP_FILE}" "${DOMAIN}${URL}"; then
    LIST_OF_FLATS=$(extract_flat_links "${TMP_FILE}")
    report_new_flats <<< "${LIST_OF_FLATS}"
  else
    printf 'warning: could not download %s%s, skipping\n' "${DOMAIN}" "${URL}" >&2
  fi
  sleep "${SLEEP_BETWEEN_REQUESTS_SECONDS}"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment