Last active
April 16, 2025 02:45
-
-
Save yurukov/1273e8eca40ebfce695bd2da144ef856 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Important notes:
1. Please run this command only during the night, when the load on the server is low
2. The script downloads all index files and documents from the open-data portal
3. The script saves your last download time and only downloads files marked as updated after that time
4. The expected download size is about 6 GB
Steps:
1. Open https://kais.cadastre.bg/bg/OpenData
2. Open DevTools
3. Find the second Fetch/XHR call to https://kais.cadastre.bg/bg/OpenData/Read (the one with a payload)
4. Copy it as a curl command; it should have '--data-raw' at the end
5. Paste that command between the two mentions of 'endOfCommand' in the script, replacing the command there
6. Run the script with bash
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Scraper for the Bulgarian cadastre open-data portal (kais.cadastre.bg).
# The curl command in the here-doc below must be a fresh "Copy as cURL"
# capture from the browser's DevTools, since the cookies and the CSRF
# token expire; see the accompanying notes for the exact steps.

# Working directories: JSON index files and downloaded document archives.
mkdir -p index doc

# Capture the recorded curl command verbatim into $curlcom.
# 'read -d ""' consumes the entire here-doc; it returns non-zero when it
# hits EOF without finding a NUL delimiter, which is harmless here.
read -r -d '' curlcom <<- 'endOfCommand'
curl 'https://kais.cadastre.bg/bg/OpenData/Read' \
-H 'Accept: */*' \
-H 'Accept-Language: en-US,en;q=0.9,bg;q=0.8,ar;q=0.7' \
-H 'Cache-Control: no-cache' \
-H 'Connection: keep-alive' \
-H 'Content-Type: application/x-www-form-urlencoded; charset=UTF-8' \
-H $'Cookie: csrf=CfDJ8N1Zr3r-DYZInfQcXSB9Jdu8vN8S8SGkP1qZza2NCmsFm5e2K6Fqk__Y7nEhFe7GlVf2T4jAN7wotc3YahNLDsl7DQsmAjDIXWRPcSVcrgdYXtdKTPT-Q8NC5PtXXfr8Mxj_iOvsoSMk8_aoYKgAmDo; s=CfDJ8N1Zr3r%2BDYZInfQcXSB9JdvuUfWX5HmqQ80Wn2TaUjH6rSaEey18vJJyxlHffxTFXA2dhWjzbUBHu6jHhErk0QJLbWaQYQ5M%2Bzlq%2FlZm6aXZ4E3mHielNAVyE%2FC9WagpR4pcKX2eRdFonA8kyP%2Fl3Pu1%2FYXo45KBan%2BsD6kPe7KE; cookiesgdpr=true; k-cookie=\u0021AIkeD5f4KbhJnBpYaued4ZpDLVf3ZOkIjGNvE1vFIuGRnTXNpeucsLcldRWeMKVzxd2MILZzOEpPLA==; TS01d36efd=01700d78c4508b08d4a924bc26e8ccb01c545be2a2aadcfc225788282cb01aeb5f1ff2976041b57bf22c07dfcf632dd11cd74206fec19d928f1e5a3e8b51c109beff9ae578' \
-H 'Origin: https://kais.cadastre.bg' \
-H 'Pragma: no-cache' \
-H 'Referer: https://kais.cadastre.bg/bg/OpenData' \
-H 'Sec-Fetch-Dest: empty' \
-H 'Sec-Fetch-Mode: cors' \
-H 'Sec-Fetch-Site: same-origin' \
-H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' \
-H 'X-CSRF-TOKEN: CfDJ8IHdlVPWlvJIkB5g9NxdbAcjm-y5YozDYReNq5ky0i0n6DQ5BLyvSqqkAjL6v-aVYNQva8Cy___6BScVfsjrYd4p3myBVYe0fpHB3WCinqysQZjulNeAn8l45J7Pb6oDCy5UHnwHubQWYbLA2Bc3EbY' \
-H 'X-Requested-With: XMLHttpRequest' \
-H 'sec-ch-ua: "Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"' \
-H 'sec-ch-ua-mobile: ?0' \
-H 'sec-ch-ua-platform: "Linux"' \
--data-raw 'path=%D0%BE%D0%B1%D0%BB%D0%B0%D1%81%D1%82+%D0%91%D1%83%D1%80%D0%B3%D0%B0%D1%81&target=%D0%BE%D0%B1%D0%BB%D0%B0%D1%81%D1%82+%D0%91%D1%83%D1%80%D0%B3%D0%B0%D1%81'
endOfCommand
# Derive two request templates from the recorded command:
#  curlcom  - silent (-s), with the trailing '--data-raw' payload stripped
#             from the last line so a fresh payload can be appended per call
#  curlcom1 - like curlcom, but the Content-Type header line is replaced by
#             "Content-Length: 0' \" for the body-less region-list POST
# NOTE: the original used backticks, which halve backslash pairs before the
# inner command runs; with $(...) the sed replacement needs exactly '\\'
# to emit the single trailing line-continuation backslash.
curlcom=$(echo "$curlcom" | sed 's_curl_curl -s_;$s_--data-raw.*__')
curlcom1=$(echo "$curlcom" | sed 's_Content-Type:.*_Content-Length: 0'"'"' \\_')
# Load the timestamp of the previous run; on a first run default to the
# epoch so every archive is treated as new.
if [ ! -f LASTSCRAPE ]; then
  echo "1970-01-01" > LASTSCRAPE
fi
lastscrape=$(cat LASTSCRAPE)
echo "Last scrape was at $lastscrape"
# Convert to seconds since the epoch for numeric comparison (GNU 'date -d').
lastscrape=$(date -d "$lastscrape" +%s)
# Fetch the top-level region index. This is a POST with an empty body,
# hence the Content-Length: 0 template (curlcom1).
echo "Downloading regions"
eval "$curlcom1 -X 'POST'" > index/obsh_ref.0.json
# For each region path in the region index, download its municipality list.
# Index file name: the path with the 'област ' (region) prefix and any
# parentheses stripped.
echo -n "Downloading municipalities "
jq -r '.[].Path' index/obsh_ref.0.json | { while IFS= read -r i; do
  f=$(echo "$i" | sed 's_област __;s_[()]\+__g')
  # NOTE(review): $i is interpolated into an eval'd single-quoted string;
  # a path containing a quote would break the command. Portal paths appear
  # not to contain quotes — verify before widening the input.
  eval "$curlcom --data-urlencode 'target=$i'" > "index/$f.1.json"
  echo -n "."
done }
echo " done"
# For each municipality path, download its settlement list.
# File name: drop 'област '/'община ' prefixes, parentheses, and turn any
# '/' into '-' so the path is a safe file name.
echo -n "Downloading settlements "
jq -r '.[].Path' index/*.1.json | { while IFS= read -r i; do
  f=$(echo "$i" | sed 's_област \|община __g;s_[()]\+__g;s_/_-_g')
  eval "$curlcom --data-urlencode 'target=$i'" > "index/$f.2.json"
  echo -n "."
done }
echo " done"
# For each settlement path, download its document list (.3.json files).
# File name: also strip the settlement prefixes 'с. ' (village) and
# 'гр. ' (town), and normalise ' (NNN' into '-NNN'.
echo -n "Downloading document list "
jq -r '.[].Path' index/*.2.json | { while IFS= read -r i; do
  f=$(echo "$i" | sed 's_област \|община \|с\. \|гр\. __g;s_ (\([0-9]\+\)_-\1_;s_[()]\+__g;s_/_-_g')
  eval "$curlcom --data-urlencode 'target=$i'" > "index/$f.3.json"
  echo -n "."
done }
echo " done"
echo "Downloaded index files: $(ls index | wc -l) data: $(du -sh index | sed 's_\s*index__')"
# Download every archive whose Modified stamp is newer than the last scrape.
# Each .3.json entry yields a '"Path","Modified"' CSV line.
echo -n "Downloading archives "
jq -r '.[]|[.Path,.Modified]|@csv' index/*.3.json | { while IFS= read -r i; do
  # Modification time: second CSV field, converted to epoch seconds.
  fd=$(echo "$i" | sed 's_.*","__;s_"__g')
  fd=$(date -d "$fd" +%s)
  if [ "$lastscrape" -gt "$fd" ]; then
    echo -n "x"   # not modified since the last scrape — skip
  else
    # Local file name: first CSV field with administrative prefixes
    # (област/община/с./гр.) stripped and slashes/parentheses normalised.
    f=$(echo "$i" | sed 's_",".*__;s_"__g;s_област \|община \|с\. \|гр\. __g;s_ (\([0-9]\+\)_-\1_;s_[()]\+__g;s_/_-_g')
    # URL-encode the raw path (jq @uri) and truncate anything after '.zip'.
    d=$(echo "$i" | sed 's_",".*__;s_"__g' | jq -sRr @uri | sed 's_\.zip.*_.zip_')
    # Rewrite the template to GET the Download endpoint for this path.
    # The final '$s_\$__' trims a stray '$' from the last line if present.
    u=$(echo "$curlcom" | sed 's_OpenData/Read_OpenData/Download?path='"$d"'_;$s_\$__')
    eval "$u" > "doc/$f"
    echo -n "."
  fi
done }
echo " done"
# Report totals, then record this run's timestamp so the next run only
# fetches archives modified after it.
echo "Downloaded doc files: $(ls doc | wc -l) data: $(du -sh doc | sed 's_\s*doc__')"
date +"%Y-%m-%d %H:%M" > LASTSCRAPE
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I've added a mechanism to only download updated files. If you run it in the same folder it will update your list of archives. It still downloads the whole index list, however.