aquaflamingo · January 12, 2025 18:40
diff --git a/web2context.zsh b/web2context.zsh
 #!/bin/zsh

 # web2context <url>
 #
 # Mirrors <url> with wget (recursion level 1, restricted to the URL's host,
 # never ascending to parent directories), deletes JavaScript/CSS, converts
 # every downloaded HTML page to plain text through convert_html_directory and
 # flattens the resulting .txt files into "$PWD/web_content". Every .txt file
 # in that directory is then concatenated into web_content/ALL.txt.
 #
 # The download and conversion run in a subshell, so the caller's working
 # directory is left untouched and a failure never terminates the calling
 # (possibly interactive) shell.
 #
 # Arguments: $1 - URL to download
 # Outputs:   progress messages on stdout, diagnostics on stderr
 # Returns:   0 on success; 1 when the argument or wget is missing, when a
 #            working directory cannot be created/entered, or when wget
 #            produced no directory for the host
 function web2context() {
     if [[ $# -eq 0 ]]; then
         echo "Usage: web2context <url>"
         return 1
     fi

     if ! command -v wget > /dev/null 2>&1; then
         echo "Error: wget is required but was not found" >&2
         return 1
     fi

     # "rc" rather than "status": the latter is a read-only special in zsh.
     local url temp_dir output_dir rc
     url="$1"
     output_dir="$PWD/web_content"

     temp_dir=$(mktemp -d) || {
         echo "Error: could not create a temporary directory" >&2
         return 1
     }

     echo "Creating output directory at $output_dir"
     if ! mkdir -p "$output_dir"; then
         rm -rf -- "$temp_dir"
         return 1
     fi

     echo "Downloading content from $url..."
     (
         # Inside the subshell "exit" only leaves the subshell; the temp dir
         # is removed by the parent afterwards on every path.
         cd "$temp_dir" || exit 1

         # Host part of the URL: drop an http(s) scheme if present, then
         # everything from the first "/", "?" or "#".
         domain=$(printf '%s\n' "$url" | sed -E 's,^https?://,,; s,[/?#].*$,,')
         echo "Downloading from domain: $domain"

         # wget's exit status is deliberately not checked: it is non-zero as
         # soon as any single requisite fails, even when the pages themselves
         # were fetched. The directory test below decides success instead.
         wget \
           --recursive \
           --level=1 \
           --no-clobber \
           --page-requisites \
           --html-extension \
           --convert-links \
           --restrict-file-names=windows \
           --domains="$domain" \
           --no-parent \
           "$url"

         find . -type f -name '*.js' -delete
         find . -type f -name '*.css' -delete

         # sed works line by line and ".*" is greedy, so only elements that
         # open and close on the same line are removed (up to the LAST closing
         # tag on that line).
         find . -type f \( -name '*.html' -o -name '*.htm' \) -print |
             while IFS= read -r file; do
                 sed -i.bak '
                     s/<script[^>]*>.*<\/script>//g
                     s/<style[^>]*>.*<\/style>//g
                     s/<link[^>]*stylesheet[^>]*>//g
                 ' "$file"
                 rm -f "${file}.bak"
             done

         if [[ ! -d "$domain" ]]; then
             echo "Error: Website content not downloaded properly. Contents of temp dir:" >&2
             ls -la >&2
             echo "Domain was: $domain" >&2
             exit 1
         fi

         echo "Moving files from $domain to output directory"
         cp -r -- "$domain" "$output_dir/" || exit 1
         cd "$output_dir" || exit 1
         convert_html_directory "$domain"

         # Flatten directory structure - move all txt files to output_dir.
         # NOTE: files sharing a basename in different sub-directories
         # overwrite each other here.
         find "$domain" -type f -name '*.txt' -exec mv {} . \;

         # Create ALL.txt from all text files. A previous ALL.txt is excluded
         # so a re-run never feeds the combined file back into itself, and an
         # empty match simply yields an empty ALL.txt.
         find . -maxdepth 1 -type f -name '*.txt' ! -name 'ALL.txt' -print |
             LC_ALL=C sort |
             while IFS= read -r file; do
                 cat -- "$file"
             done > ALL.txt

         # Remove HTML files and clean up directories
         rm -rf -- "$domain"

         echo "Processing complete! All text files are in $output_dir"
         echo "Combined text saved in $output_dir/ALL.txt"
     )
     rc=$?

     rm -rf -- "$temp_dir"
     return $rc
 }

 # _html_to_text <html_file>
 #
 # Writes a plain-text rendering of <html_file> next to it, replacing the
 # file's final extension with ".txt". Uses "lynx -dump -nolist" when lynx is
 # available; otherwise falls back to a sed script that strips tags and decodes
 # the entities &nbsp; &lt; &gt; &amp;.
 #
 # Arguments: $1 - path of the HTML file
 # Outputs:   "Converted: <src> -> <dst>" on stdout
 function _html_to_text() {
     local html_file text_file
     html_file="$1"

     # Portable equivalent of zsh's "${html_file:r}": drop the extension of
     # the last path component only (in bash ":r" would be read as a substring
     # offset and leave the name unchanged).
     case "${html_file##*/}" in
         *.*) text_file="${html_file%.*}.txt" ;;
         *)   text_file="${html_file}.txt" ;;
     esac

     if command -v lynx > /dev/null 2>&1; then
         lynx -dump -nolist "$html_file" > "$text_file"
     else
         # Line-oriented fallback: script/style elements are only removed when
         # they open and close on the same line. &amp; is decoded last so that
         # e.g. "&amp;lt;" is not decoded twice.
         sed '
             s/<script[^>]*>.*<\/script>//g
             s/<style[^>]*>.*<\/style>//g
             s/<[^>]*>//g
             s/&nbsp;/ /g
             s/&lt;/</g
             s/&gt;/>/g
             s/&amp;/\&/g
             s/^[[:space:]]*//
             /^$/d
         ' "$html_file" > "$text_file"
     fi

     echo "Converted: $html_file -> $text_file"
 }

 # convert_html_directory <directory>
 #
 # Converts every *.html / *.htm file below <directory> to a sibling .txt file
 # using _html_to_text.
 #
 # Arguments: $1 - directory to scan recursively
 # Outputs:   one "Converted: ..." line per file on stdout
 # Returns:   1 when the argument is missing or is not a directory
 function convert_html_directory() {
     if [[ $# -eq 0 ]]; then
         echo "Usage: convert_html_directory <directory>"
         return 1
     fi

     # "file" is declared local as well: zsh runs the last pipeline stage in
     # the current shell, so the loop variable would otherwise leak.
     local website_dir file
     website_dir="$1"

     if [[ ! -d "$website_dir" ]]; then
         echo "Error: '$website_dir' is not a directory" >&2
         return 1
     fi

     find "$website_dir" -type f \( -name '*.html' -o -name '*.htm' \) -print |
         while IFS= read -r file; do
             _html_to_text "$file"
         done
 }
diff --git a/web2context_README.md b/web2context_README.md
	#!/bin/zsh

	function web2context() {
	if [[ $# -eq 0 ]]; then
	echo "Usage: web2context <url>"
	return 1
	fi

	url="$1"
	temp_dir=$(mktemp -d)
	output_dir="$PWD/web_content"

	echo "Creating output directory at $output_dir"
	mkdir -p "$output_dir"

	echo "Downloading content from $url..."
	cd "$temp_dir" || exit 1

	domain=$(echo "$url" | sed -E 's#^https?://([^/]+).*#\1#')
	echo "Downloading from domain: $domain"

	wget \
	--recursive \
	--level=1 \
	--no-clobber \
	--page-requisites \
	--html-extension \
	--convert-links \
	--restrict-file-names=windows \
	--domains="$domain" \
	--no-parent \
	"$url"

	find . -type f -name "*.js" -delete
	find . -type f -name "*.css" -delete

	find . -type f \( -name "*.html" -o -name "*.htm" \) | while read -r file; do
	sed -i.bak '
	s/<script[^>]*>.*<\/script>//g
	s/<style[^>]*>.*<\/style>//g
	s/<link[^>]*stylesheet[^>]*>//g
	' "$file"
	rm -f "${file}.bak"
	done

	if [[ -d "$domain" ]]; then
	echo "Moving files from $domain to output directory"
	cp -r "$domain" "$output_dir/"
	cd "$output_dir" || exit 1
	convert_html_directory "$domain"

	# Flatten directory structure - move all txt files to output_dir
	find "$domain" -type f -name "*.txt" -exec mv {} . \;

	# Create ALL.txt from all text files
	cat *.txt > ALL.txt

	# Remove HTML files and clean up directories
	rm -rf "$domain"

	echo "Processing complete! All text files are in $output_dir"
	echo "Combined text saved in $output_dir/ALL.txt"
	else
	echo "Error: Website content not downloaded properly. Contents of temp dir:"
	ls -la
	echo "Domain was: $domain"
	fi

	rm -rf "$temp_dir"
	}

	function convert_html_directory() {
	if [[ $# -eq 0 ]]; then
	echo "Usage: convert_html_directory <directory>"
	return 1
	fi

	website_dir="$1"

	function _html_to_text() {
	html_file="$1"
	text_file="${html_file:r}.txt"

	if command -v lynx &> /dev/null; then
	lynx -dump -nolist "$html_file" > "$text_file"
	else
	sed '
	s/<script[^>]*>.*<\/script>//g
	s/<style[^>]*>.*<\/style>//g
	s/<[^>]*>//g
	s/&nbsp;/ /g
	s/&lt;/</g
	s/&gt;/>/g
	s/&amp;/\&/g
	s/^[[:space:]]*//
	/^$/d
	' "$html_file" > "$text_file"
	fi

	echo "Converted: $html_file -> $text_file"
	}

	find "$website_dir" -type f \( -name "*.html" -o -name "*.htm" \) | while read -r file; do
	_html_to_text "$file"
	done
	}