Last active
December 19, 2015 17:29
-
-
Save huaxinjiayou/098bfa3f5f9794388db2 to your computer and use it in GitHub Desktop.
ruby document文本简单解析
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
# Minimal, forgiving HTML parser that turns markup into a tree of plain Hashes.
#
# Every node is a Hash with these bookkeeping keys:
#   '_id_'       - sequence number assigned while parsing
#   'nodeName'   - lower-cased tag name, or 'text' for text nodes
#   'nodeValue'  - raw text (text nodes only)
#   'collapsed'  - true once the node has been closed
#   'childNodes' - Array of child nodes (set when an element is closed)
# Element attributes are stored in the same Hash under their lower-cased
# names; attribute values keep their original case and spacing.
class HtmlDoc
  # Patterns removed from the input before parsing. Line breaks go first so
  # that the non-greedy `.` in the later patterns can span what used to be
  # several lines.
  STRIP_PATTERNS = [
    /[\r\n]/,                              # line breaks
    /<\s*!--.*?--\s*>/,                    # comments
    /<\s*style.*?>.*?<\s*\/style\s*>/i,    # style blocks
    /<\s*script.*?>.*?<\s*\/script\s*>/i   # script blocks
  ].freeze

  # Tags dropped from the tree entirely.
  IGNORED_TAGS = %w[!doctype meta link].freeze

  # Tags that never have a closing tag.
  VOID_TAGS = %w[area base basefont br col frame hr img input isindex link meta param embed].freeze

  # Node bookkeeping keys that an attribute must never overwrite.
  RESERVED_KEYS = %w[_id_ nodeName nodeValue collapsed childNodes].freeze

  # Opening (or standalone) tag, e.g. `<div ...>`, `<br/>`, `<!doctype html>`.
  START_TAG = /<\s*!?[\w]+.*?>/

  # Closing tag, e.g. `</div>`.
  END_TAG = %r{<[^/]*/.*?>}

  # name="double quoted" | name='single quoted' | name=bare
  ATTRIBUTE_PATTERN = /([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"']+))/

  attr_reader :root

  # @param str [String, Hash] HTML source to parse, or an already parsed node
  #   Hash to wrap as-is. A String argument is never modified.
  def initialize(str = '')
    if str.is_a?(String)
      # Non-destructive gsub: the caller's string stays untouched and frozen
      # strings are accepted.
      cleaned = STRIP_PATTERNS.inject(str) { |text, pattern| text.gsub(pattern, '') }
      @root = parse(cleaned)
    else
      @root = str
    end
  end

  # Serializes the children of the root node. Only tag names and text are
  # emitted; attributes are not written back.
  #
  # @return [String] '' when the root has no 'childNodes'
  def inner_html
    children = root['childNodes']
    return '' unless children

    children.map do |el|
      if el['nodeName'] == 'text'
        el['nodeValue']
      else
        "<#{el['nodeName']}>#{HtmlDoc.new(el).inner_html}</#{el['nodeName']}>"
      end
    end.join
  end

  # Depth-first search for the node whose 'id' attribute equals +sid+.
  #
  # @param sid [String] id to look for (compared case-sensitively)
  # @return [HtmlDoc, nil] a document wrapping the matching node (self when
  #   the root itself matches), or nil when nothing matches
  def get_element_by_id(sid)
    # Without this guard a nil id would "match" every node that has no id.
    return nil if sid.nil?
    return self if root['id'] == sid
    return nil unless root['childNodes']

    root['childNodes'].each do |el|
      result = HtmlDoc.new(el).get_element_by_id(sid)
      return result if result
    end
    nil
  end

  # Collects every descendant of the root whose tag name equals +stag_name+
  # (case-insensitive), in document order.
  #
  # @param stag_name [String, Symbol] tag name to look for
  # @return [Array<Hash>] the matching raw node Hashes (not HtmlDoc wrappers)
  def get_element_by_tag_name(stag_name)
    collect_by_tag(root, stag_name.to_s.downcase, [])
  end

  private

  # Appends all descendants of +node+ named +name+ to +found+ (pre-order).
  def collect_by_tag(node, name, found)
    (node['childNodes'] || []).each do |child|
      found << child if child['nodeName'] == name
      collect_by_tag(child, name, found)
    end
    found
  end

  # Builds the node tree from cleaned HTML.
  #
  # `stack` holds the nodes seen so far with the NEWEST node first. When a
  # closing tag arrives, everything newer than its matching opening tag is
  # moved into that tag's 'childNodes'.
  #
  # @param html [String]
  # @return [Hash] the single top-level node, or a synthetic 'root' node
  #   wrapping all top-level nodes when there is not exactly one
  def parse(html)
    stack = []
    next_id = 0

    html.split(/(<.*?>)/).each do |token|
      next if token.strip.empty?

      if token =~ START_TAG
        element = build_element(token, next_id += 1)
        stack.unshift(element) if element
      elsif token =~ END_TAG
        close_element(stack, token)
      else
        stack.unshift('_id_' => next_id += 1, 'nodeName' => 'text',
                      'nodeValue' => token, 'collapsed' => true)
      end
    end

    collapse(stack)
    stack.length == 1 ? stack.first : { 'nodeName' => 'root', 'childNodes' => stack.reverse }
  end

  # Turns an opening tag token into a node Hash.
  #
  # Only the tag name and attribute names are lower-cased. A trailing `/`
  # (`<br/>`, `<img ... />`) marks the element as already closed.
  #
  # @return [Hash, nil] nil for tags listed in IGNORED_TAGS
  def build_element(token, nid)
    inner = token.gsub(/[<>]/, '').strip
    self_closing = inner.end_with?('/')
    inner = inner.sub(%r{\s*/+\z}, '') if self_closing

    name, attr_text = inner.split(/\s+/, 2)
    name = name.downcase
    return nil if IGNORED_TAGS.include?(name)

    element = {
      '_id_' => nid,
      'nodeName' => name,
      'collapsed' => self_closing || VOID_TAGS.include?(name)
    }
    # Attributes without a value (e.g. `disabled`) are not recorded.
    attr_text.to_s.scan(ATTRIBUTE_PATTERN) do |key, double_quoted, single_quoted, bare|
      key = key.downcase
      next if RESERVED_KEYS.include?(key)

      element[key] = double_quoted || single_quoted || bare
    end
    element
  end

  # Handles a closing tag: finds the nearest still-open element with the same
  # name and adopts every newer node on the stack as its children. Closing
  # tags without a matching open element are ignored.
  def close_element(stack, token)
    name = token.gsub(/[<>\/]/, '').strip.split(/\s+/).first
    return unless name

    name = name.downcase
    position = stack.find_index { |node| node['nodeName'] == name && !node['collapsed'] }
    return unless position

    element = stack[position]
    element['collapsed'] = true
    # The removed slice is newest-first; close leftovers inside it, then
    # reverse to restore document order.
    element['childNodes'] = collapse(stack.slice!(0...position)).reverse
  end

  # Closes elements whose closing tag is missing. Scanning from the newest
  # node, each still-open element adopts every newer node as its children.
  # Mutates and returns +atag+.
  def collapse(atag)
    i = 0
    while i < atag.length
      node = atag[i]
      if node['collapsed']
        i += 1
      else
        node['childNodes'] = atag.slice!(0...i).reverse
        node['collapsed'] = true
        # The node now sits at index 0; continue with the next older node.
        i = 1
      end
    end
    atag
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment