Last active
April 16, 2025 02:45
-
-
Save yurukov/1273e8eca40ebfce695bd2da144ef856 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Important notes:
1. Please run this command only during the night, when the load on the server is low
2. The script downloads all index files and documents from the open-data portal
3. The script saves your last download time and only downloads files marked as updated after that time
4. The expected download size is about 6 GB
Steps:
1. Open https://kais.cadastre.bg/bg/OpenData
2. Open DevTools
3. Find the second Fetch/XHR call to https://kais.cadastre.bg/bg/OpenData/Read (the one with a payload)
4. Copy it as a curl command; it should have '--data-raw' at the end
5. Paste that command between the two mentions of 'endOfCommand' in the script, replacing the command there
6. Run the script with bash
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Scraper for the Bulgarian cadastre open-data portal (kais.cadastre.bg).
# The curl command in the here-doc below must be a fresh "Copy as cURL"
# capture from the browser's DevTools, since the cookies and the CSRF
# token expire; see the accompanying notes for the exact steps.

# Working directories: JSON index files and downloaded document archives.
mkdir -p index doc

# Capture the recorded curl command verbatim into $curlcom.
# 'read -d ""' consumes the entire here-doc; it returns non-zero when it
# hits EOF without finding a NUL delimiter, which is harmless here.
read -r -d '' curlcom <<- 'endOfCommand'
curl 'https://kais.cadastre.bg/bg/OpenData/Read' \
-H 'Accept: */*' \
-H 'Accept-Language: en-US,en;q=0.9,bg;q=0.8,ar;q=0.7' \
-H 'Cache-Control: no-cache' \
-H 'Connection: keep-alive' \
-H 'Content-Type: application/x-www-form-urlencoded; charset=UTF-8' \
-H $'Cookie: csrf=CfDJ8N1Zr3r-DYZInfQcXSB9Jdu8vN8S8SGkP1qZza2NCmsFm5e2K6Fqk__Y7nEhFe7GlVf2T4jAN7wotc3YahNLDsl7DQsmAjDIXWRPcSVcrgdYXtdKTPT-Q8NC5PtXXfr8Mxj_iOvsoSMk8_aoYKgAmDo; s=CfDJ8N1Zr3r%2BDYZInfQcXSB9JdvuUfWX5HmqQ80Wn2TaUjH6rSaEey18vJJyxlHffxTFXA2dhWjzbUBHu6jHhErk0QJLbWaQYQ5M%2Bzlq%2FlZm6aXZ4E3mHielNAVyE%2FC9WagpR4pcKX2eRdFonA8kyP%2Fl3Pu1%2FYXo45KBan%2BsD6kPe7KE; cookiesgdpr=true; k-cookie=\u0021AIkeD5f4KbhJnBpYaued4ZpDLVf3ZOkIjGNvE1vFIuGRnTXNpeucsLcldRWeMKVzxd2MILZzOEpPLA==; TS01d36efd=01700d78c4508b08d4a924bc26e8ccb01c545be2a2aadcfc225788282cb01aeb5f1ff2976041b57bf22c07dfcf632dd11cd74206fec19d928f1e5a3e8b51c109beff9ae578' \
-H 'Origin: https://kais.cadastre.bg' \
-H 'Pragma: no-cache' \
-H 'Referer: https://kais.cadastre.bg/bg/OpenData' \
-H 'Sec-Fetch-Dest: empty' \
-H 'Sec-Fetch-Mode: cors' \
-H 'Sec-Fetch-Site: same-origin' \
-H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' \
-H 'X-CSRF-TOKEN: CfDJ8IHdlVPWlvJIkB5g9NxdbAcjm-y5YozDYReNq5ky0i0n6DQ5BLyvSqqkAjL6v-aVYNQva8Cy___6BScVfsjrYd4p3myBVYe0fpHB3WCinqysQZjulNeAn8l45J7Pb6oDCy5UHnwHubQWYbLA2Bc3EbY' \
-H 'X-Requested-With: XMLHttpRequest' \
-H 'sec-ch-ua: "Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"' \
-H 'sec-ch-ua-mobile: ?0' \
-H 'sec-ch-ua-platform: "Linux"' \
--data-raw 'path=%D0%BE%D0%B1%D0%BB%D0%B0%D1%81%D1%82+%D0%91%D1%83%D1%80%D0%B3%D0%B0%D1%81&target=%D0%BE%D0%B1%D0%BB%D0%B0%D1%81%D1%82+%D0%91%D1%83%D1%80%D0%B3%D0%B0%D1%81'
endOfCommand
# Derive two request templates from the recorded command:
#  curlcom  - silent (-s), with the trailing '--data-raw' payload stripped
#             from the last line so a fresh payload can be appended per call
#  curlcom1 - like curlcom, but the Content-Type header line is replaced by
#             "Content-Length: 0' \" for the body-less region-list POST
# NOTE: the original used backticks, which halve backslash pairs before the
# inner command runs; with $(...) the sed replacement needs exactly '\\'
# to emit the single trailing line-continuation backslash.
curlcom=$(echo "$curlcom" | sed 's_curl_curl -s_;$s_--data-raw.*__')
curlcom1=$(echo "$curlcom" | sed 's_Content-Type:.*_Content-Length: 0'"'"' \\_')
# Load the timestamp of the previous run; on a first run default to the
# epoch so every archive is treated as new.
if [ ! -f LASTSCRAPE ]; then
  echo "1970-01-01" > LASTSCRAPE
fi
lastscrape=$(cat LASTSCRAPE)
echo "Last scrape was at $lastscrape"
# Convert to seconds since the epoch for numeric comparison (GNU 'date -d').
lastscrape=$(date -d "$lastscrape" +%s)
# Fetch the top-level region index. This is a POST with an empty body,
# hence the Content-Length: 0 template (curlcom1).
echo "Downloading regions"
eval "$curlcom1 -X 'POST'" > index/obsh_ref.0.json
# For each region path in the region index, download its municipality list.
# Index file name: the path with the 'област ' (region) prefix and any
# parentheses stripped.
echo -n "Downloading municipalities "
jq -r '.[].Path' index/obsh_ref.0.json | { while IFS= read -r i; do
  f=$(echo "$i" | sed 's_област __;s_[()]\+__g')
  # NOTE(review): $i is interpolated into an eval'd single-quoted string;
  # a path containing a quote would break the command. Portal paths appear
  # not to contain quotes — verify before widening the input.
  eval "$curlcom --data-urlencode 'target=$i'" > "index/$f.1.json"
  echo -n "."
done }
echo " done"
# For each municipality path, download its settlement list.
# File name: drop 'област '/'община ' prefixes, parentheses, and turn any
# '/' into '-' so the path is a safe file name.
echo -n "Downloading settlements "
jq -r '.[].Path' index/*.1.json | { while IFS= read -r i; do
  f=$(echo "$i" | sed 's_област \|община __g;s_[()]\+__g;s_/_-_g')
  eval "$curlcom --data-urlencode 'target=$i'" > "index/$f.2.json"
  echo -n "."
done }
echo " done"
# For each settlement path, download its document list (.3.json files).
# File name: also strip the settlement prefixes 'с. ' (village) and
# 'гр. ' (town), and normalise ' (NNN' into '-NNN'.
echo -n "Downloading document list "
jq -r '.[].Path' index/*.2.json | { while IFS= read -r i; do
  f=$(echo "$i" | sed 's_област \|община \|с\. \|гр\. __g;s_ (\([0-9]\+\)_-\1_;s_[()]\+__g;s_/_-_g')
  eval "$curlcom --data-urlencode 'target=$i'" > "index/$f.3.json"
  echo -n "."
done }
echo " done"
echo "Downloaded index files: $(ls index | wc -l) data: $(du -sh index | sed 's_\s*index__')"
# Download every archive whose Modified stamp is newer than the last scrape.
# Each .3.json entry yields a '"Path","Modified"' CSV line.
echo -n "Downloading archives "
jq -r '.[]|[.Path,.Modified]|@csv' index/*.3.json | { while IFS= read -r i; do
  # Modification time: second CSV field, converted to epoch seconds.
  fd=$(echo "$i" | sed 's_.*","__;s_"__g')
  fd=$(date -d "$fd" +%s)
  if [ "$lastscrape" -gt "$fd" ]; then
    echo -n "x"   # not modified since the last scrape — skip
  else
    # Local file name: first CSV field with administrative prefixes
    # (област/община/с./гр.) stripped and slashes/parentheses normalised.
    f=$(echo "$i" | sed 's_",".*__;s_"__g;s_област \|община \|с\. \|гр\. __g;s_ (\([0-9]\+\)_-\1_;s_[()]\+__g;s_/_-_g')
    # URL-encode the raw path (jq @uri) and truncate anything after '.zip'.
    d=$(echo "$i" | sed 's_",".*__;s_"__g' | jq -sRr @uri | sed 's_\.zip.*_.zip_')
    # Rewrite the template to GET the Download endpoint for this path.
    # The final '$s_\$__' trims a stray '$' from the last line if present.
    u=$(echo "$curlcom" | sed 's_OpenData/Read_OpenData/Download?path='"$d"'_;$s_\$__')
    eval "$u" > "doc/$f"
    echo -n "."
  fi
done }
echo " done"
# Report totals, then record this run's timestamp so the next run only
# fetches archives modified after it.
echo "Downloaded doc files: $(ls doc | wc -l) data: $(du -sh doc | sed 's_\s*doc__')"
date +"%Y-%m-%d %H:%M" > LASTSCRAPE
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I've added a mechanism to only download updated files. If you run it in the same folder it will update your list of archives. It still downloads the whole index list, however.