Skip to content

Instantly share code, notes, and snippets.

@retlehs
Created April 17, 2026 15:54
Show Gist options
  • Select an option

  • Save retlehs/cf0ac6c74476e766fba2f14076fff501 to your computer and use it in GitHub Desktop.

Select an option

Save retlehs/cf0ac6c74476e766fba2f14076fff501 to your computer and use it in GitHub Desktop.
Backlinks for any domain via Common Crawl
@sujay1599
Copy link
Copy Markdown

#!/usr/bin/env bash
set -euo pipefail

# ──────────────────────────────────────────────────────────────
# cc-backlinks.sh  –  query Common Crawl domain-level backlinks
#
# Usage:  ./cc-backlinks.sh [domain] [top_n]
#   domain  – domain to look up            (default: example.com)
#   top_n   – max linking domains to show  (default: 100)
#
# Env vars:
#   CC_RELEASE   – crawl release slug      (default: cc-main-2026-jan-feb-mar)
#   CC_THREADS   – DuckDB thread count     (default: all cores)
# ──────────────────────────────────────────────────────────────

DOMAIN="${1:-example.com}"
TOP_N="${2:-100}"
RELEASE="${CC_RELEASE:-cc-main-2026-jan-feb-mar}"
THREADS="${CC_THREADS:-0}"   # 0 = DuckDB default (all cores)

# DOMAIN and TOP_N are interpolated into SQL heredocs further down, so
# reject anything that is not a plain hostname / positive integer here
# (prevents SQL injection and nonsense queries alike).
if [[ ! "$DOMAIN" =~ ^[A-Za-z0-9]([A-Za-z0-9.-]*[A-Za-z0-9])?$ ]]; then
  echo "error: '${DOMAIN}' does not look like a valid domain name" >&2
  exit 1
fi
if [[ ! "$TOP_N" =~ ^[0-9]+$ ]] || (( TOP_N < 1 )); then
  echo "error: top_n must be a positive integer (got '${TOP_N}')" >&2
  exit 1
fi
if [[ ! "$THREADS" =~ ^[0-9]+$ ]]; then
  echo "error: CC_THREADS must be a non-negative integer (got '${THREADS}')" >&2
  exit 1
fi

# Per-release cache directory so switching CC_RELEASE never mixes files.
CACHE="${HOME}/.cache/cc-backlinks/${RELEASE}"
BASE="https://data.commoncrawl.org/projects/hyperlinkgraph/${RELEASE}/domain"

VERTICES="${CACHE}/domain-vertices.txt.gz"
EDGES="${CACHE}/domain-edges.txt.gz"

mkdir -p "$CACHE"

# ── Preflight checks ──────────────────────────────────────────

# Verify a required external tool is on PATH; on failure print the
# error plus any install hints (one per extra argument) and abort.
require_tool() {
  local tool="$1"; shift
  if command -v "$tool" >/dev/null 2>&1; then
    return 0
  fi
  echo "error: ${tool} not installed." >&2
  local hint
  for hint in "$@"; do
    echo "  ${hint}" >&2
  done
  exit 1
}

require_tool duckdb \
  "macOS:  brew install duckdb" \
  "Linux:  https://duckdb.org/docs/installation"
require_tool curl

# ── Reverse domain notation (io.roots for roots.io) ──────────

# Print a dotted hostname with its labels reversed, e.g.
# "roots.io" -> "io.roots".  The Common Crawl vertex file stores
# domains in this reversed notation, so lookups must use it too.
reverse_domain() {
  awk -F. '{
    for (i=NF; i>0; i--)
      printf "%s%s", $i, (i>1 ? "." : "")
  }' <<<"$1"
}

REV_DOMAIN=$(reverse_domain "$DOMAIN")

echo ">> domain:   ${DOMAIN}  (reversed: ${REV_DOMAIN})" >&2
echo ">> release:  ${RELEASE}" >&2
echo ">> cache:    ${CACHE}" >&2

# ── Download helper (resume-safe, skips if already present) ──

# Download url -> dest, skipping the transfer when dest is already cached.
# The transfer writes to "<dest>.part" and is renamed into place only on
# success: an interrupted run can therefore never leave a truncated $dest
# that a later run would mistake for a complete download (the old version
# had exactly that hazard), and "curl -C -" can genuinely resume the
# leftover .part file on the next attempt.
download() {
  local url="$1" dest="$2" label
  label=$(basename "$dest")
  if [[ -f "$dest" ]]; then
    echo ">> [skip] ${label} already cached" >&2
    return
  fi
  echo ">> downloading ${label} ..." >&2
  if ! curl -L --fail --progress-bar -C - -o "${dest}.part" "$url"; then
    # Keep the partial file for resumption; just report and abort.
    echo "error: failed to download ${url}" >&2
    exit 1
  fi
  mv -f -- "${dest}.part" "$dest"
}

# Fetch both halves of the domain graph for this release: the vertex list
# (id, reversed domain, host count) and the edge list (from_id, to_id).
# Each call is a no-op when the file is already in the cache.
download "${BASE}/${RELEASE}-domain-vertices.txt.gz" "$VERTICES"
download "${BASE}/${RELEASE}-domain-edges.txt.gz"    "$EDGES"

# ── Early exit: confirm domain exists in the graph ───────────

echo ">> checking domain exists in vertices ..." >&2

# SQL string-literal escaping: double any single quotes, since REV_DOMAIN
# is derived from user input and is spliced into the query text below.
SQL_REV_DOMAIN="${REV_DOMAIN//\'/\'\'}"

# -noheader is required: plain "duckdb -csv" emits a "count_star()" header
# row, so the captured value would be two lines and the numeric
# [[ -eq ]] comparison below would fail with a bash arithmetic error.
VERTEX_COUNT=$(duckdb -csv -noheader <<SQL
SELECT COUNT(*) FROM read_csv(
  '${VERTICES}',
  delim='\t', header=false,
  columns={'id':'BIGINT','rev_domain':'VARCHAR','num_hosts':'BIGINT'}
) WHERE rev_domain = '${SQL_REV_DOMAIN}';
SQL
)

if [[ "$VERTEX_COUNT" -eq 0 ]]; then
  echo "error: '${DOMAIN}' not found in the ${RELEASE} vertex file." >&2
  echo "  The domain may be too small to appear in Common Crawl, or" >&2
  echo "  the reversed form '${REV_DOMAIN}' may not match." >&2
  exit 1
fi

# ── Main query ────────────────────────────────────────────────

echo ">> querying backlinks for ${DOMAIN} ..." >&2
echo ">> NOTE: first run scans ~16 GB of gzipped edges — expect several minutes" >&2

duckdb <<SQL
PRAGMA threads=${THREADS};

.mode box

WITH vertices AS (
  SELECT *
  FROM read_csv(
    '${VERTICES}',
    delim='\t', header=false,
    columns={'id':'BIGINT','rev_domain':'VARCHAR','num_hosts':'BIGINT'}
  )
),

target AS (
  -- Use IN (not =) in case of duplicate rev_domain entries
  SELECT id
  FROM vertices
  WHERE rev_domain = '${REV_DOMAIN}'
),

inbound AS (
  SELECT from_id
  FROM read_csv(
    '${EDGES}',
    delim='\t', header=false,
    columns={'from_id':'BIGINT','to_id':'BIGINT'}
  )
  WHERE to_id IN (SELECT id FROM target)
    AND from_id NOT IN (SELECT id FROM target)  -- exclude self-links
)

SELECT
  array_to_string(
    list_reverse(string_split(v.rev_domain, '.')),
    '.'
  )                          AS linking_domain,
  v.num_hosts                AS host_count,
  COUNT(*)                   AS edge_count
FROM inbound i
JOIN vertices v ON v.id = i.from_id
GROUP BY v.rev_domain, v.num_hosts
ORDER BY v.num_hosts DESC, linking_domain
LIMIT ${TOP_N};
SQL

@dev-kunalchauhan
Copy link
Copy Markdown

Hey, thanks for sharing this. My system can’t handle the 16GB download/setup. Could you run it for techfusiongear.com on your side and send me the output?

@ripgrim
Copy link
Copy Markdown

ripgrim commented Apr 19, 2026

Hey, thanks for sharing this. My system can’t handle the 16GB download/setup. Could you run it for techfusiongear.com on your side and send me the output?

😂😂😂

@Checker9x
Copy link
Copy Markdown

Hey, thanks for sharing this. My system can’t handle the 16GB download/setup. Could you run it for techfusiongear.com on your side and send me the output?

lol

@s0md3v
Copy link
Copy Markdown

s0md3v commented Apr 19, 2026

Hey, thanks for sharing this. My system can’t handle the 16GB download/setup. Could you run it for techfusiongear.com on your side and send me the output?

ts frying me 😭✌️

@sujay1599
Copy link
Copy Markdown

Sorry can’t bro. Run at Best Buy.

@Checker9x
Copy link
Copy Markdown

Or maybe you can run at Ikea I guess.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment