Skip to content

Instantly share code, notes, and snippets.

@chuchuva
Last active August 16, 2019 01:29
Show Gist options
  • Save chuchuva/6893062c8bfd1764572a729a782018de to your computer and use it in GitHub Desktop.
Save chuchuva/6893062c8bfd1764572a729a782018de to your computer and use it in GitHub Desktop.
HtmlExtractor lolcats
using HtmlAgilityPack;
using Newtonsoft.Json.Linq;
using System.IO;
using System.Linq;
namespace HtmlExtractor
{
    /// <summary>
    /// Console tool that loads an HTML page, finds every
    /// <c>&lt;p&gt;</c> element whose class contains <c>mine_image</c>, and writes the
    /// image URL and caption of each one to <c>lolcats.json</c>.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c> is the address of the page to load (passed to <c>HtmlWeb.Load</c>).
        /// </param>
        static void Main(string[] args)
        {
            // Guard against a missing argument instead of crashing with
            // IndexOutOfRangeException on args[0].
            if (args == null || args.Length == 0)
            {
                System.Console.Error.WriteLine("Usage: HtmlExtractor <url>");
                System.Environment.ExitCode = 1;
                return;
            }

            var document = new HtmlWeb().Load(args[0]);
            var nodes = document.DocumentNode.SelectNodes("//p[contains(@class,'mine_image')]");

            var lolcats = new JArray();

            // HtmlAgilityPack's SelectNodes returns null (not an empty collection)
            // when the XPath matches nothing, so enumerate only when there are matches.
            // An empty JSON array is still written in that case.
            if (nodes != null)
            {
                foreach (var node in nodes)
                {
                    lolcats.Add(ToLolcat(node));
                }
            }

            File.WriteAllText("lolcats.json", lolcats.ToString());
        }

        /// <summary>
        /// Builds the JSON object for one matched image paragraph.
        /// </summary>
        /// <param name="node">A <c>&lt;p&gt;</c> element matched by the <c>mine_image</c> query.</param>
        /// <returns>
        /// An object with an <c>image</c> property (the <c>src</c> of the first descendant
        /// <c>&lt;img&gt;</c>, or an empty string when there is no image or no <c>src</c>)
        /// and a <c>caption</c> property (inner text of the first following sibling
        /// <c>&lt;p&gt;</c>, or JSON null when there is none).
        /// </returns>
        private static JObject ToLolcat(HtmlNode node)
        {
            // The caption is taken from the first following sibling <p>, if any.
            var captionNode = node.SelectSingleNode("following-sibling::p");
            string caption = captionNode != null ? captionNode.InnerText : null;

            // SelectSingleNode returns null when the paragraph has no <img>;
            // fall back to the same "" default used for a missing src attribute.
            var imageNode = node.SelectSingleNode(".//img");
            string image = imageNode != null ? imageNode.GetAttributeValue("src", "") : "";

            return new JObject(
                new JProperty("image", image),
                new JProperty("caption", caption));
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment