Skip to content

Instantly share code, notes, and snippets.

@jxtx
Created September 17, 2014 15:58

Revisions

  1. jxtx created this gist Sep 17, 2014.
    57 changes: 57 additions & 0 deletions scrape_gs.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,57 @@
    /**
    * usage: node scrape_gs.js USERKEY
    *
    * Determine the total citation count and h-index for papers published IN OR
    * AFTER each year found in a Google Scholar profile. The USERKEY is the value
    * of the "user" query parameter in your Google Scholar citations page URL.
    */

    var request = require('request');
    var cheerio = require('cheerio');
    var _ = require('underscore');

    // Command-line arguments after the node binary and the script path.
    // Named `args` rather than `arguments`: the latter shadows the implicit
    // function-level `arguments` object and is a syntax error in strict mode.
    var args = process.argv.slice(2);

    // Google Scholar user key (first positional argument); undefined when the
    // script is started without one.
    var userKey = args[0];

    // Profile page URL, asking for a page size of 1000 rows. The key is
    // URI-encoded so characters such as "&" or "#" cannot alter the query string.
    var url = "http://scholar.google.com/citations?user=" + encodeURIComponent( userKey ) + "&pagesize=1000";

    /**
     * Compute the h-index of a list of citation counts.
     *
     * @param {number[]} citationsDesc - citation counts sorted in descending order.
     * @returns {number} the largest h such that h papers have at least h
     *     citations each (0 for an empty list).
     */
    function hIndex( citationsDesc ) {
        var h = 0;
        // The paper at 0-based rank h counts only if it has at least h + 1
        // citations. Running off the end of the list means every paper counts,
        // so the answer is the list length rather than 0.
        while ( h < citationsDesc.length && citationsDesc[ h ] > h ) {
            h++;
        }
        return h;
    }

    /**
     * Extract one { year, citations } record per publication row of a profile page.
     *
     * @param {string} html - body of the citations page.
     * @returns {Array<{year: number, citations: number}>} plain array of records;
     *     a year or citation cell that is empty or non-numeric is recorded as 0.
     */
    function parsePublications( html ) {
        // Kept local: assigning to an undeclared `$` would create a global.
        var $ = cheerio.load( html );

        return $( ".gsc_a_tr" ).map( function( index, row ) {
            var $row = $( row );
            return {
                year: parseInt( $row.find( "span.gsc_a_h" ).text(), 10 ) || 0,
                citations: parseInt( $row.find( "a.gsc_a_ac" ).text(), 10 ) || 0
            };
        }).get(); // .get() unwraps the cheerio collection into a real array
    }

    /**
     * Print, for each year from newest to oldest, the total citations and the
     * h-index of all publications dated in that year or later.
     *
     * Publications recorded with year 0 sort last, so they only contribute to
     * the final "0" row.
     *
     * @param {Array<{year: number, citations: number}>} publications
     */
    function printCumulativeStats( publications ) {
        console.log( "Year TotalCitations h-index" );

        var byYear = _.groupBy( publications, "year" );

        // groupBy keys are strings; compare them numerically, newest first.
        var years = _.keys( byYear ).sort( function( a, b ) { return b - a; } );

        // Citation counts of every publication seen so far (this year and later).
        var cumCitations = [];

        _.each( years, function( year ) {
            cumCitations = cumCitations.concat(
                _.map( byYear[ year ], function( pub ) { return pub.citations; } ) );

            // hIndex() expects descending order.
            cumCitations.sort( function( a, b ) { return b - a; } );

            var totalCitations = _.reduce( cumCitations, function( sum, c ) { return sum + c; }, 0 );

            console.log( year, totalCitations, hIndex( cumCitations ) );
        });
    }

    if ( !userKey ) {
        // Without a key the request would be made for "user=undefined".
        console.error( "usage: node scrape_gs.js USERKEY" );
        process.exitCode = 1;
    } else {
        request( url, function( err, resp, body ) {
            // On a transport error `body` is undefined; report it instead of
            // handing it to the HTML parser.
            if ( err ) {
                console.error( "Request for " + url + " failed: " + err.message );
                process.exitCode = 1;
                return;
            }

            // A non-200 page is not a profile listing.
            if ( resp.statusCode !== 200 ) {
                console.error( "Request for " + url + " returned HTTP status " + resp.statusCode );
                process.exitCode = 1;
                return;
            }

            printCumulativeStats( parsePublications( body ) );
        });
    }