Skip to content

Instantly share code, notes, and snippets.

@huaxinjiayou
Last active December 19, 2015 17:29
Show Gist options
  • Save huaxinjiayou/098bfa3f5f9794388db2 to your computer and use it in GitHub Desktop.
Save huaxinjiayou/098bfa3f5f9794388db2 to your computer and use it in GitHub Desktop.
ruby document文本简单解析
# coding: utf-8
# Minimal, forgiving HTML parser that turns markup into a tree of plain hashes.
#
# Every node is a Hash with string keys:
#   '_id_'       - sequence number assigned while parsing
#   'nodeName'   - lower-cased tag name, or 'text' for text nodes
#   'nodeValue'  - the text itself (text nodes only)
#   'childNodes' - Array of child node hashes (set once an element is closed)
#   'collapsed'  - true once the element is closed (internal bookkeeping)
# plus one entry per attribute: lower-cased attribute name => value, with the
# value's original case preserved.
class HtmlDoc
  # Removed from the markup before parsing, in this order. Line breaks go
  # first so that `.` in the later patterns can span what used to be lines.
  STRIP_PATTERNS = [
    /[\r\n]/,                             # line breaks
    /<\s*!--.*?--\s*>/,                   # comments
    /<\s*style.*?>.*?<\s*\/style\s*>/i,   # style blocks
    /<\s*script.*?>.*?<\s*\/script\s*>/i  # script blocks
  ].freeze

  # Tags that are dropped from the tree entirely.
  IGNORED_TAGS = %w[!doctype meta link].freeze

  # Tags that never take an end tag and are therefore closed immediately.
  VOID_TAGS = %w[area base basefont br col frame hr img input isindex link meta param embed].freeze

  # Keys the parser uses for its own bookkeeping; attributes with these names
  # are skipped so markup cannot corrupt the closing logic.
  RESERVED_KEYS = %w[_id_ collapsed].freeze

  TOKEN_SPLITTER = /(<.*?>)/
  OPEN_TAG = /<\s*!?[\w]+.*?>/ # start tag or stand-alone tag
  CLOSE_TAG = /<[^\/]*\/.*?>/  # end tag
  # name=value where value is double-quoted, single-quoted or a bare word.
  ATTRIBUTE = /([^\s=]+)\s*=\s*("[^"]*"|'[^']*'|[^\s]+)/

  attr_reader :root

  # @param str [String, Hash] HTML markup to parse, or an already parsed node
  #   hash to wrap as-is. A String argument is never modified.
  def initialize(str = '')
    @root =
      if str.is_a?(String)
        # Non-destructive gsub: the caller's string (possibly frozen) is left alone.
        parse(STRIP_PATTERNS.reduce(str) { |markup, pattern| markup.gsub(pattern, '') })
      else
        str
      end
  end

  # Serialises the children of the root node. Attributes are not emitted and
  # every element is written as an open/close pair.
  #
  # @return [String] '' when the root has no 'childNodes'
  def inner_html
    children = root['childNodes']
    return '' unless children

    children.map do |el|
      if el['nodeName'] == 'text'
        el['nodeValue']
      else
        name = el['nodeName']
        "<#{name}>#{HtmlDoc.new(el).inner_html}</#{name}>"
      end
    end.join
  end

  # Depth-first search for the node whose 'id' attribute equals +sid+.
  #
  # @param sid [String] id to look for (compared case-sensitively)
  # @return [HtmlDoc, nil] wrapper around the first matching node, or nil
  def get_element_by_id(sid)
    # Without this guard a nil id would "match" every node lacking an id.
    return nil if sid.nil?
    return self if root['id'] == sid

    (root['childNodes'] || []).each do |el|
      found = HtmlDoc.new(el).get_element_by_id(sid)
      return found if found
    end
    nil
  end

  # Collects every descendant of the root whose tag name matches.
  #
  # @param stag_name [String] tag name, matched case-insensitively
  # @return [Array<Hash>] raw node hashes in document (pre-order) order
  def get_element_by_tag_name(stag_name)
    collect_by_tag(root, stag_name.downcase, [])
  end

  private

  # Appends to +found+ every descendant of +node+ named +tag_name+.
  def collect_by_tag(node, tag_name, found)
    (node['childNodes'] || []).each do |el|
      found << el if el['nodeName'] == tag_name
      collect_by_tag(el, tag_name, found)
    end
    found
  end

  # Builds the node tree for +html+ (already stripped of line breaks).
  #
  # @return [Hash] the only top-level node, or a synthetic 'root' node holding
  #   all top-level nodes when there is not exactly one
  def parse(html)
    stack = [] # newest node first
    next_id = 0
    html.split(TOKEN_SPLITTER).each do |token|
      next if token.strip.empty?

      if token =~ OPEN_TAG
        node = build_element(token, next_id += 1)
        stack.unshift(node) if node
      elsif token =~ CLOSE_TAG
        close_element(stack, token)
      else
        stack.unshift({ '_id_' => next_id += 1, 'nodeName' => 'text',
                        'nodeValue' => token, 'collapsed' => true })
      end
    end
    collapse(stack)
    stack.length == 1 ? stack.first : { 'nodeName' => 'root', 'childNodes' => stack.reverse }
  end

  # Turns a start tag such as `<a href="x">` into a node hash.
  #
  # @return [Hash, nil] nil when the tag is one of IGNORED_TAGS
  def build_element(tag, nid)
    name, attr_text = tag.gsub(/[<>]/, '').strip.split(/\s+/, 2)
    # `<br/>` arrives as "br/"; drop the slash so the void-tag lookup works.
    name = name.downcase.chomp('/')
    return if IGNORED_TAGS.include?(name)

    node = { '_id_' => nid, 'nodeName' => name, 'collapsed' => VOID_TAGS.include?(name) }
    attr_text.to_s.scan(ATTRIBUTE) do |key, value|
      key = key.downcase
      next if RESERVED_KEYS.include?(key)

      node[key] = value.gsub(/\A["']|["']\z/, '')
    end
    node
  end

  # Handles an end tag: closes the nearest still-open element of that name and
  # gives it everything pushed after it as children. Unmatched end tags are ignored.
  def close_element(stack, tag)
    name = tag.gsub(/[<>\/]/, '').strip.split(/\s+/).first.to_s.downcase
    index = stack.find_index { |node| node['nodeName'] == name && !node['collapsed'] }
    return unless index

    node = stack[index]
    node['collapsed'] = true
    node['childNodes'] = collapse(stack.slice!(0...index)).reverse
  end

  # Closes elements whose end tag is missing. +nodes+ is ordered newest first;
  # each still-open element adopts every node in front of it (i.e. everything
  # that followed it in the document). Mutates and returns +nodes+.
  def collapse(nodes)
    index = 0
    while index < nodes.length
      node = nodes[index]
      if node['collapsed']
        index += 1
      else
        node['childNodes'] = nodes.slice!(0...index).reverse
        node['collapsed'] = true
        index = 1 # the element just closed now sits at position 0
      end
    end
    nodes
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment