Created
March 8, 2017 12:55
-
-
Save anonymous/167c2912fe0746f4b05b1fec51ae0f14 to your computer and use it in GitHub Desktop.
HK text extractor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[package]
name = "hk"
version = "0.0.1"

[dependencies]
byteorder = "1.0"
xml-rs = "0.4"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This file must be placed at src/main.rs, in the directory where Cargo.toml resides.
extern crate byteorder; | |
extern crate xml; | |
use std::env; | |
use std::fs::{self, File}; | |
use std::io::{self, BufReader, BufRead, BufWriter, Read, ErrorKind, Write}; | |
use std::mem; | |
use std::path::Path; | |
use byteorder::{LittleEndian, ReadBytesExt}; | |
use xml::reader::{ParserConfig, XmlEvent}; | |
fn main() { | |
let mut args: Vec<_> = env::args().skip(1).take(2).collect(); | |
let (source_file, dest_dir) = if args.len() == 2 { | |
let dest_dir = args.pop().unwrap(); | |
let source_file = args.pop().unwrap(); | |
(source_file, dest_dir) | |
} else { | |
panic!("Expected exactly two arguments: <source> <dest>"); | |
}; | |
fs::create_dir_all(&dest_dir).unwrap(); | |
let mut source = BufReader::new(File::open(&source_file).unwrap()); | |
macro_rules! break_if_none { | |
($val:expr) => { | |
match $val { | |
Some(v) => v, | |
None => break, | |
} | |
} | |
} | |
loop { | |
let name_length = break_if_none!(read_length(&mut source)); | |
let mut name_block = vec![0u8; name_length as usize]; | |
source.read_exact(&mut name_block).expect("Failed to read name block"); | |
skip_zeros(&mut source).expect("Failed to skip zeros after the name block"); | |
let data_length = break_if_none!(read_length(&mut source)); | |
let mut data_block = vec![0u8; data_length as usize]; | |
source.read_exact(&mut data_block).expect("Failed to read data block"); | |
skip_zeros(&mut source).expect("Failed to skip zeros after the data block"); | |
process_item(name_block, data_block, &dest_dir); | |
} | |
} | |
/// Name prefixes of items that `process_item` skips without producing output.
///
/// The comparison is a plain `starts_with` on the decoded item name.
// NOTE(review): "ES", "FR" and "DE" look like language codes — confirm.
const INVALID_PREFIXES: &'static [&'static str] = &[
    "ES",
    "FR",
    "DE",
    "Word Count",
    "_",
    "LineBreaking",
];
fn process_item(name_block: Vec<u8>, data_block: Vec<u8>, dest_dir: &str) { | |
let prefixed_name = String::from_utf8_lossy(&name_block); | |
if INVALID_PREFIXES.iter().cloned().find(|&p| prefixed_name.starts_with(p)).is_some() { | |
return; | |
} | |
let mut parts: Vec<_> = prefixed_name.splitn(2, "_").collect(); | |
if parts.len() != 2 { | |
println!("Strange name: {}", prefixed_name); | |
return; | |
} | |
let name: &str = parts.pop().unwrap(); | |
let prefix: &str = parts.pop().unwrap(); | |
let mut entries = parse_entries(data_block); | |
entries.sort_by(|p1, p2| p1.0.cmp(&p2.0)); | |
println!("{} in {}: {} entries", name, prefix, entries.len()); | |
store_entries(dest_dir, prefix, name, entries); | |
} | |
/// Writes `entries` to `<dest_dir>/<subdir_name>/<file_name>.txt`.
///
/// The sub-directory is created if it does not exist and the file is
/// truncated if it does. Each entry is rendered by `write_entry`. The
/// buffered writer is flushed explicitly at the end, because a flush that
/// happens implicitly when the writer is dropped cannot report its error.
///
/// # Panics
///
/// Panics if the directory or file cannot be created, or if writing or
/// flushing fails.
fn store_entries(dest_dir: &str, subdir_name: &str, file_name: &str,
                 entries: Vec<(String, String)>) {
    let dir_path = Path::new(dest_dir).join(subdir_name);
    // `unwrap_or_else` builds the panic message only on failure.
    fs::create_dir_all(&dir_path).unwrap_or_else(|e| {
        panic!("Failed to create directory {}: {:?}", dir_path.display(), e)
    });

    let file_path = dir_path.join(format!("{}.txt", file_name));
    let file = File::create(&file_path).unwrap_or_else(|e| {
        panic!("Failed to create file {}: {:?}", file_path.display(), e)
    });
    let mut file = BufWriter::new(file);

    for (entry_name, entry_text) in entries {
        write_entry(&mut file, &entry_name, &entry_text).unwrap_or_else(|e| {
            panic!("failed to write an entry to file {}: {:?}", file_path.display(), e)
        });
    }

    file.flush().unwrap_or_else(|e| {
        panic!("failed to flush file {}: {:?}", file_path.display(), e)
    });
}

/// Writes one entry: an 80-dash rule, `<name>:`, a blank line, then the text.
///
/// `<br>` markers become a newline, `<page>` markers become a blank line, and
/// a trailing newline is appended when the converted text lacks one.
fn write_entry<W: Write>(out: &mut W, entry_name: &str, entry_text: &str) -> io::Result<()> {
    const RULE_WIDTH: usize = 80;

    out.write_all(&[b'-'; RULE_WIDTH])?;
    out.write_all(b"\n")?;
    out.write_all(entry_name.as_bytes())?;
    out.write_all(b":\n\n")?;

    let text = entry_text.replace("<br>", "\n").replace("<page>", "\n\n");
    out.write_all(text.as_bytes())?;
    if !text.ends_with('\n') {
        out.write_all(b"\n")?;
    }
    Ok(())
}
fn parse_entries(data_block: Vec<u8>) -> Vec<(String, String)> { | |
let mut result = Vec::new(); | |
let parser = ParserConfig::new() | |
.whitespace_to_characters(true) | |
.coalesce_characters(true) | |
.cdata_to_characters(true) | |
.create_reader(&*data_block); | |
let mut entry_name = String::new(); | |
for e in parser { | |
let e = match e { | |
Ok(e) => e, | |
Err(e) => { | |
println!("Failed to read an XML document: {}", e); | |
return result; | |
} | |
}; | |
match e { | |
XmlEvent::StartElement { name, attributes, .. } => { | |
if name.local_name == "entry" { | |
match attributes.into_iter().find(|attr| attr.name.local_name == "name") { | |
Some(attr) => entry_name = attr.value, | |
None => println!("Failed to determine entry name, attribute is missing"), | |
} | |
} | |
}, | |
XmlEvent::Characters(text) => { | |
if !entry_name.is_empty() { | |
result.push((mem::replace(&mut entry_name, String::new()), text)); | |
} | |
}, | |
_ => {} | |
} | |
} | |
result | |
} | |
/// Consumes the run of zero bytes at the current position of `r`.
///
/// Reading stops at the first non-zero byte, which is left unconsumed, or
/// when the reader hands back an empty buffer. Returns the number of zero
/// bytes consumed.
///
/// # Errors
///
/// Any I/O error from `fill_buf` is returned, except `Interrupted`, which is
/// retried.
fn skip_zeros<R: BufRead>(r: &mut R) -> io::Result<usize> {
    let mut skipped = 0;
    loop {
        // The borrow of the reader's internal buffer is confined to this
        // block so that `consume` can be called afterwards.
        let (zeros, hit_non_zero) = {
            let buffer = match r.fill_buf() {
                Ok(buffer) => buffer,
                Err(ref err) if err.kind() == ErrorKind::Interrupted => continue,
                Err(err) => return Err(err),
            };
            let zeros = buffer.iter().take_while(|&&byte| byte == 0).count();
            (zeros, zeros < buffer.len())
        };

        r.consume(zeros);
        skipped += zeros;

        // Either a non-zero byte was found, or nothing could be consumed
        // (empty buffer): in both cases the run is over.
        if hit_non_zero || zeros == 0 {
            return Ok(skipped);
        }
    }
}
fn read_length<R: Read>(r: &mut R) -> Option<u32> { | |
match r.read_u32::<LittleEndian>() { | |
Ok(length) => Some(length), | |
Err(ref e) if e.kind() == ErrorKind::UnexpectedEof => None, | |
Err(e) => panic!("Failed to read length: {}", e), | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment