Created
April 17, 2026 15:54
-
Star
(291)
You must be signed in to star a gist -
Fork
(73)
You must be signed in to fork a gist
-
-
Save retlehs/cf0ac6c74476e766fba2f14076fff501 to your computer and use it in GitHub Desktop.
Backlinks for any domain via Common Crawl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| set -euo pipefail | |
| DOMAIN="${1:-example.com}" | |
| RELEASE="${CC_RELEASE:-cc-main-2026-jan-feb-mar}" | |
| CACHE="${HOME}/.cache/cc-backlinks/${RELEASE}" | |
| BASE="https://data.commoncrawl.org/projects/hyperlinkgraph/${RELEASE}/domain" | |
| VERTICES="${CACHE}/domain-vertices.txt.gz" | |
| EDGES="${CACHE}/domain-edges.txt.gz" | |
| mkdir -p "$CACHE" | |
| if ! command -v duckdb >/dev/null; then | |
| echo "error: duckdb not installed. Run: brew install duckdb" >&2 | |
| exit 1 | |
| fi | |
| # Reverse domain: roots.io -> io.roots | |
| REV_DOMAIN=$(awk -F. '{for(i=NF;i>0;i--) printf "%s%s", $i, (i>1?".":"")}' <<<"$DOMAIN") | |
| download() { | |
| local url="$1" dest="$2" | |
| if [[ -f "$dest" ]]; then return; fi | |
| echo ">> downloading $(basename "$dest") ..." >&2 | |
| curl -L --fail -C - -o "$dest" "$url" | |
| } | |
| download "${BASE}/${RELEASE}-domain-vertices.txt.gz" "$VERTICES" | |
| download "${BASE}/${RELEASE}-domain-edges.txt.gz" "$EDGES" | |
| echo ">> querying backlinks to ${DOMAIN} (reversed: ${REV_DOMAIN}) ..." >&2 | |
| echo ">> first run scans ~16 GB of gzipped edges; expect several minutes" >&2 | |
| duckdb <<SQL | |
| .mode box | |
| WITH vertices AS ( | |
| SELECT * FROM read_csv('${VERTICES}', delim='\t', header=false, | |
| columns={'id':'BIGINT','rev_domain':'VARCHAR','num_hosts':'BIGINT'}) | |
| ), | |
| target AS ( | |
| SELECT id FROM vertices WHERE rev_domain = '${REV_DOMAIN}' | |
| ), | |
| inbound AS ( | |
| SELECT from_id FROM read_csv('${EDGES}', delim='\t', header=false, | |
| columns={'from_id':'BIGINT','to_id':'BIGINT'}) | |
| WHERE to_id = (SELECT id FROM target) | |
| ) | |
| SELECT | |
| array_to_string(list_reverse(string_split(v.rev_domain, '.')), '.') AS linking_domain, | |
| v.num_hosts | |
| FROM inbound i | |
| JOIN vertices v ON v.id = i.from_id | |
| ORDER BY v.num_hosts DESC, linking_domain; | |
| SQL |
sujay1599
commented
Apr 19, 2026
Hey, thanks for sharing this. My system can’t handle the 16GB download/setup. Could you run it for techfusiongear.com on your side and send me the output?
Hey, thanks for sharing this. My system can’t handle the 16GB download/setup. Could you run it for
techfusiongear.comon your side and send me the output?
😂😂😂
Hey, thanks for sharing this. My system can’t handle the 16GB download/setup. Could you run it for
techfusiongear.comon your side and send me the output?
lol
Hey, thanks for sharing this. My system can’t handle the 16GB download/setup. Could you run it for
techfusiongear.comon your side and send me the output?
ts frying me 😭✌️
Sorry can’t bro. Run at Best Buy.
Or maybe you can run at Ikea I guess.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment