Sebastian Nagel sebastian-nagel

correlation metrics between character set and content language

	WARC/1.0
	WARC-Type: metadata
	WARC-Target-URI: https://en.wikipedia.org/wiki/Saturn
	WARC-Date: 2024-12-11T20:20:04Z
	WARC-Record-ID: <urn:uuid:74b1614e-97bb-4a19-b02f-defc603ab81c>
	WARC-Refers-To: <urn:uuid:90f1a666-d5ba-4e8d-806d-4d848e77a0f8>
	Content-Type: application/json
	Content-Length: 1910

	{

	### Jython
	# install Jython (see https://www.jython.org/download)
	wget https://repo1.maven.org/maven2/org/python/jython-standalone/2.7.2/jython-standalone-2.7.2.jar

	# clone pywebgraph (fork with modifications)
	git clone https://github.com/commoncrawl/py-web-graph.git
	cd py-web-graph
	# copy console.py into the current working directory so that "pywebgraph" is visible as a package
	cp pywebgraph/console.py .

	from warcio.archiveiterator import ArchiveIterator

	# Walk through all records of a WET file (opened in binary mode) and read
	# the target URI and the UTF-8 decoded payload of each "conversion" record.
	with open('path/to/file.wet.gz', 'rb') as stream:
	    for record in ArchiveIterator(stream):
	        # records of any other type are skipped
	        if record.rec_type == 'conversion':
	            url = record.rec_headers.get_header('WARC-Target-URI')
	            text = record.content_stream().read().decode('utf-8')

	<?xml version="1.0" encoding="UTF-8"?>
	<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
	<sitemap>
	<loc>
	<![CDATA[ http://www.example.com/sitemap1.xml ]]>
	</loc>
	<lastmod>
	<![CDATA[ 2018-12-12 02:06:56 ]]>
	</lastmod>
	</sitemap>

	#% zgrep '^{"Container' .../CC-MAIN-XXX-XXX.warc.wat.gz \
	# \| jq --raw-output '."Envelope"."Payload-Metadata"."HTTP-Response-Metadata"."HTML-Metadata"."Links"[]?.path' \
	# \| sort \| uniq -c \| sort -k1,1nr
	# see also:
	# https://github.com/commoncrawl/ia-web-commons/issues/9
	# https://github.com/commoncrawl/ia-web-commons/issues/8
	# https://github.com/iipc/webarchive-commons/pull/72
	7777908 A@/href
	1266284 IMG@/src
	90022 STYLE/#text

	import fileinput
	import sys

	import boto3
	import botocore

	import ujson as json


	no_sign_request = botocore.client.Config(

	# hanging executor on Spark 2.1.0 and Python 2.7

	from pyspark import SparkContext


	class BadEncodedException(Exception):
	    """Exception that keeps the stringified failure reason in ``msg``.

	    NOTE(review): judging by the name this signals badly encoded input;
	    the code raising it is not visible in this excerpt -- confirm.
	    """

	    def __init__(self, reason):
	        """Store ``str(reason)`` as ``self.msg`` and use it as the message.

	        ``reason`` may be any object; it is converted with ``str()``.
	        """
	        self.msg = str(reason)
	        # two-argument super() keeps the class usable on Python 2.7
	        super(BadEncodedException, self).__init__(self.msg)

	#!/bin/bash

	#### extract news sites from DMOZ.org ####

	# dependencies
	# Linux
	# bash
	# wget
	# perl
	# regexp-assemble

	import fileinput
	import sys
	import tldextract
	from _collections import defaultdict
	from math import log


	# NOTE(review): "THR" presumably abbreviates "threshold"; the code consuming
	# these two constants is not part of this excerpt -- confirm their meaning there.
	RANK_DIVERGENCE_THR = 0.02
	HOST_LENGTH_DIVERGENCE_THR = 0.15