Skip to content

Instantly share code, notes, and snippets.

@robertz
Created October 2, 2023 12:40
Show Gist options
  • Save robertz/2a5885fe0ccbb48d8ffc8cf633b9c995 to your computer and use it in GitHub Desktop.
Save robertz/2a5885fe0ccbb48d8ffc8cf633b9c995 to your computer and use it in GitHub Desktop.
Read page metadata using jSoup in ColdFusion
/**
 * Reads social-sharing metadata (Open Graph / Twitter Card tags) from web pages
 * and extracts paragraph text from HTML, using the jSoup Java library.
 */
component {

	// jSoup entry class (org.jsoup.Jsoup), supplied through the "javaLoader:" injection DSL.
	property name="jSoup" inject="javaLoader:org.jsoup.Jsoup";

	/**
	 * Fetches a page and collects its Open Graph / Twitter Card meta tags.
	 *
	 * The returned struct always contains:
	 *   - "url"        : the link that was requested
	 *   - "alt_images" : array of <img> URLs (jpg/jpeg/gif), filled only when the page
	 *                    declares no social image (og:image / twitter:image)
	 * plus one key per "og:*" / "twitter:*" meta tag found (e.g. "og:title"), holding
	 * that tag's content attribute.
	 *
	 * This is best effort: if the page cannot be fetched or parsed, the error is logged
	 * and whatever was collected so far is returned.
	 *
	 * @link The URL of the page to inspect.
	 *
	 * @return Struct of metadata as described above.
	 */
	function spider( required string link ){
		var meta = { "url" : arguments.link, "alt_images" : [] };

		try {
			var jsDoc = jSoup
				.connect( arguments.link )
				.followRedirects( true )
				.userAgent( "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0" )
				.get();

			jsDoc.select( "meta" ).each( function( tag ){
				var metaName     = tag.attr( "name" );
				var metaProperty = tag.attr( "property" );

				// Store the tag under whichever attribute actually carries the social
				// prefix; "name" wins when both qualify.
				if ( isSocialMetaKey( metaName ) ) {
					meta[ metaName ] = tag.attr( "content" );
				} else if ( isSocialMetaKey( metaProperty ) ) {
					meta[ metaProperty ] = tag.attr( "content" );
				}
			} );

			// Keys are stored with their prefix ("og:image", not "image"), so the
			// fallback decision has to look at the prefixed keys.
			var hasSocialImage = false;
			for ( var imageKey in [ "og:image", "twitter:image", "twitter:image:src" ] ) {
				if ( meta.keyExists( imageKey ) && len( meta[ imageKey ] ) ) {
					hasSocialImage = true;
					break;
				}
			}

			if ( !hasSocialImage ) {
				jsDoc.select( "img" ).each( function( img ){
					// Prefer src; fall back to data-img-url so lazy-loaded images whose
					// src is only a placeholder are still picked up. attr() yields an
					// empty string for a missing attribute, which never matches.
					for ( var attrName in [ "src", "data-img-url" ] ) {
						var candidate = img.attr( attrName );
						if ( hasImageExtension( candidate ) ) {
							meta.alt_images.append( candidate );
							return;
						}
					}
				} );
			}
		} catch ( any e ) {
			// Keep the best-effort contract (never throw to the caller), but leave a trace.
			writeLog(
				type = "error",
				log  = "application",
				text = "spider() could not read metadata from #arguments.link#: #e.message#"
			);
		}

		return meta;
	}

	/**
	 * Extracts the text of every <p> element in an HTML string and returns it as a
	 * sequence of <p>...</p> blocks with all inner markup removed.
	 *
	 * The paragraph text is HTML-encoded before being wrapped: jSoup's text() returns
	 * entity-decoded text, so emitting it raw would turn source text such as
	 * "&lt;script&gt;" back into live markup.
	 *
	 * @html The HTML to parse.
	 *
	 * @return The paragraphs concatenated without a separator; an empty string when
	 *         the input contains no <p> elements.
	 */
	function extract_text( required string html ){
		var paragraphs = [];

		jSoup.parse( arguments.html ).select( "p" ).each( function( el ){
			paragraphs.append( "<p>" & encodeForHTML( el.text() ) & "</p>" );
		} );

		return arrayToList( paragraphs, "" );
	}

	/**
	 * True when a meta name/property starts with the "og:" or "twitter:" prefix
	 * (case-sensitive). A prefix match is used so that e.g. "blog:x" is not
	 * mistaken for an "og:" key.
	 */
	private boolean function isSocialMetaKey( required string key ){
		return find( "og:", arguments.key ) == 1 || find( "twitter:", arguments.key ) == 1;
	}

	/**
	 * True when the given image source contains ".jpg", ".jpeg" or ".gif" (case-insensitive).
	 */
	private boolean function hasImageExtension( required string source ){
		return findNoCase( ".jpg", arguments.source ) > 0
			|| findNoCase( ".jpeg", arguments.source ) > 0
			|| findNoCase( ".gif", arguments.source ) > 0;
	}

}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment