huaxinjiayou · December 19, 2015 17:29
diff --git a/dom.rb b/dom.rb
 # coding: utf-8

 # Minimal, forgiving HTML parser that turns markup into a tree of plain hashes.
 #
 # Element nodes are hashes holding 'nodeName' (lowercase tag name), '_id_'
 # (a sequence number assigned while parsing), 'collapsed' (whether the tag
 # has been closed), one entry per attribute keyed by the lowercase attribute
 # name and, once the element has been closed around its content, 'childNodes'
 # (children in document order). Text nodes use 'nodeName' => 'text' and keep
 # their content in 'nodeValue'.
 class HtmlDoc
     # @return [Hash] root node of the tree wrapped by this document
     attr_reader :root

     # Builds a document from raw HTML or wraps an already parsed node.
     #
     # @param str [String, Hash] HTML source, or a node hash produced by a
     #   previous parse (used internally to recurse into children)
     def initialize(str = '')
         if str.is_a?(String)
             # Strip noise on a copy: the caller's string is never modified,
             # so frozen strings are accepted as well.
             cleaned = [
                 /[\r\n]/, # line breaks
                 /<\s*!--.*?--\s*>/, # comments
                 /<\s*style.*?>.*?<\s*\/style\s*>/i, # style blocks
                 /<\s*script.*?>.*?<\s*\/script\s*>/i # script blocks
             ].reduce(str) { |text, regex| text.gsub(regex, '') }

             @root = parse(cleaned)
         else
             @root = str
         end
     end

     # Serializes the children of the root node.
     #
     # Only tag names and text are emitted; attributes are not written out and
     # every element is rendered with an explicit closing tag.
     #
     # @return [String] markup of the child nodes, '' when there are none
     def inner_html
         html = ''.dup
         children = root['childNodes']
         return html unless children

         children.each do |el|
             name = el['nodeName']
             html << (name == 'text' ? el['nodeValue'] : "<#{name}>#{HtmlDoc.new(el).inner_html}</#{name}>")
         end
         html
     end

     # Depth-first search for the node whose 'id' attribute equals +sid+.
     #
     # @param sid [String] id to look for (compared case-sensitively)
     # @return [HtmlDoc, nil] document wrapping the matching node, or nil
     def get_element_by_id(sid)
         return self if root['id'] == sid

         (root['childNodes'] || []).each do |el|
             found = HtmlDoc.new(el).get_element_by_id(sid)
             return found if found
         end
         nil
     end

     # Collects every descendant element with the given tag name.
     #
     # @param stag_name [String] tag name, matched case-insensitively
     # @return [Array<Hash>] matching node hashes in document order
     def get_element_by_tag_name(stag_name)
         children = root['childNodes']
         return [] unless children

         wanted = stag_name.downcase
         children.each_with_object([]) do |el, found|
             found << el if el['nodeName'] == wanted
             found.concat(HtmlDoc.new(el).get_element_by_tag_name(wanted))
         end
     end

     private

     # Parses cleaned HTML into a node tree.
     #
     # Nodes are pushed onto the front of a stack (newest first). A closing tag
     # adopts everything pushed after its matching opening tag as children.
     #
     # @param html [String] markup without line breaks, comments, styles, scripts
     # @return [Hash] the single top-level node, or a synthetic 'root' node
     #   wrapping all top-level nodes
     def parse(html)
         stack = []
         id = 0
         html.split(/(<.*?>)/).each do |piece|
             next if piece.strip.empty?

             if piece =~ /<\s*!?[\w]+.*?>/ # opening or standalone tag
                 ainfos = piece.gsub(/[<>]/, '').strip.split(/\s+/)
                 self_closing = strip_self_closing_slash(ainfos)
                 otag = split_info(ainfos, id += 1)
                 next unless otag # ignored tag

                 otag['collapsed'] = true if self_closing
                 stack.unshift(otag)
             elsif piece =~ /<[^\/]*\/.*?>/ # closing tag
                 # Only the tag name is lowercased, never attribute values.
                 sname = piece.gsub(/[<>\/]/, '').strip.split(/\s+/).first.to_s.downcase
                 nindex = stack.find_index { |node| node['nodeName'] == sname && !node['collapsed'] }
                 next unless nindex # stray closing tag

                 otag = stack[nindex]
                 otag['collapsed'] = true
                 # Adopt the newer nodes as children, closing any tag among
                 # them that is still open.
                 otag['childNodes'] = collapse(stack.slice!(0...nindex)).reverse
             else # text node
                 stack.unshift({'_id_' => id += 1, 'nodeName' => 'text', 'nodeValue' => piece, 'collapsed' => true})
             end
         end

         collapse(stack)
         stack.length == 1 ? stack.first : {'nodeName' => 'root', 'childNodes' => stack.reverse}
     end

     # Detects and removes the trailing slash of an XHTML-style tag such as
     # <br/> or <img src="a.png" />.
     #
     # @param ainfos [Array<String>] whitespace-separated tag tokens (modified)
     # @return [Boolean] true when the tag was written as self-closing
     def strip_self_closing_slash(ainfos)
         last = ainfos.last
         return false unless last && last.end_with?('/')
         # An unquoted value such as href=/dir/ legitimately ends with a slash.
         return false if last.include?('=') && last !~ /["']\/\z/

         trimmed = last.chomp('/')
         if trimmed.empty?
             ainfos.pop
         else
             ainfos[-1] = trimmed
         end
         true
     end

     # Builds the node hash for an opening tag.
     #
     # Tag and attribute names are lowercased; attribute values keep their
     # case and may contain '='. Tokens without a name or a value are skipped.
     # NOTE(review): an attribute literally named 'collapsed' or '_id_' would
     # overwrite the bookkeeping keys of the node.
     #
     # @param ainfos [Array<String>] tag name followed by attribute tokens
     # @param nid [Integer] sequence number stored under '_id_'
     # @return [Hash, nil] the node, or nil for ignored tags
     def split_info(ainfos, nid)
         aignore_tag = '!doctype,meta,link'.split(',')
         asingle_tag = 'area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed'.split(',')

         sname = ainfos.shift.to_s.downcase
         return if aignore_tag.include?(sname)

         # Tags that never take a closing tag start out closed.
         otag = {'_id_' => nid, 'nodeName' => sname, 'collapsed' => asingle_tag.include?(sname)}
         ainfos.each do |token|
             key, value = token.split('=', 2)
             next if key.nil? || key.empty? || value.nil? || value.empty?

             otag[key.downcase] = value.gsub(/\A["']|["']\z/, '')
         end
         otag
     end

     # Closes every tag that is still open in a newest-first node list.
     #
     # Each open tag adopts all nodes in front of it (those that followed it
     # in the document) as its children.
     #
     # @param atag [Array<Hash>] newest-first node list (modified in place)
     # @return [Array<Hash>] the same list, now containing only closed nodes
     def collapse(atag)
         while (nindex = atag.index { |node| !node['collapsed'] })
             otag = atag[nindex]
             otag['childNodes'] = atag.slice!(0...nindex).reverse
             otag['collapsed'] = true
         end
         atag
     end
 end
	# coding: utf-8

	# Minimal, forgiving HTML parser that turns markup into a tree of plain hashes.
	#
	# Element nodes are hashes holding 'nodeName' (lowercase tag name), '_id_'
	# (a sequence number assigned while parsing), 'collapsed' (whether the tag
	# has been closed), one entry per attribute keyed by the lowercase attribute
	# name and, once the element has been closed around its content, 'childNodes'
	# (children in document order). Text nodes use 'nodeName' => 'text' and keep
	# their content in 'nodeValue'.
	class HtmlDoc
	    # @return [Hash] root node of the tree wrapped by this document
	    attr_reader :root

	    # Builds a document from raw HTML or wraps an already parsed node.
	    #
	    # @param str [String, Hash] HTML source, or a node hash produced by a
	    #   previous parse (used internally to recurse into children)
	    def initialize(str = '')
	        if str.is_a?(String)
	            # Strip noise on a copy: the caller's string is never modified,
	            # so frozen strings are accepted as well.
	            cleaned = [
	                /[\r\n]/, # line breaks
	                /<\s*!--.*?--\s*>/, # comments
	                /<\s*style.*?>.*?<\s*\/style\s*>/i, # style blocks
	                /<\s*script.*?>.*?<\s*\/script\s*>/i # script blocks
	            ].reduce(str) { |text, regex| text.gsub(regex, '') }

	            @root = parse(cleaned)
	        else
	            @root = str
	        end
	    end

	    # Serializes the children of the root node.
	    #
	    # Only tag names and text are emitted; attributes are not written out and
	    # every element is rendered with an explicit closing tag.
	    #
	    # @return [String] markup of the child nodes, '' when there are none
	    def inner_html
	        html = ''.dup
	        children = root['childNodes']
	        return html unless children

	        children.each do |el|
	            name = el['nodeName']
	            html << (name == 'text' ? el['nodeValue'] : "<#{name}>#{HtmlDoc.new(el).inner_html}</#{name}>")
	        end
	        html
	    end

	    # Depth-first search for the node whose 'id' attribute equals +sid+.
	    #
	    # @param sid [String] id to look for (compared case-sensitively)
	    # @return [HtmlDoc, nil] document wrapping the matching node, or nil
	    def get_element_by_id(sid)
	        return self if root['id'] == sid

	        (root['childNodes'] || []).each do |el|
	            found = HtmlDoc.new(el).get_element_by_id(sid)
	            return found if found
	        end
	        nil
	    end

	    # Collects every descendant element with the given tag name.
	    #
	    # @param stag_name [String] tag name, matched case-insensitively
	    # @return [Array<Hash>] matching node hashes in document order
	    def get_element_by_tag_name(stag_name)
	        children = root['childNodes']
	        return [] unless children

	        wanted = stag_name.downcase
	        children.each_with_object([]) do |el, found|
	            found << el if el['nodeName'] == wanted
	            found.concat(HtmlDoc.new(el).get_element_by_tag_name(wanted))
	        end
	    end

	    private

	    # Parses cleaned HTML into a node tree.
	    #
	    # Nodes are pushed onto the front of a stack (newest first). A closing tag
	    # adopts everything pushed after its matching opening tag as children.
	    #
	    # @param html [String] markup without line breaks, comments, styles, scripts
	    # @return [Hash] the single top-level node, or a synthetic 'root' node
	    #   wrapping all top-level nodes
	    def parse(html)
	        stack = []
	        id = 0
	        html.split(/(<.*?>)/).each do |piece|
	            next if piece.strip.empty?

	            if piece =~ /<\s*!?[\w]+.*?>/ # opening or standalone tag
	                ainfos = piece.gsub(/[<>]/, '').strip.split(/\s+/)
	                self_closing = strip_self_closing_slash(ainfos)
	                otag = split_info(ainfos, id += 1)
	                next unless otag # ignored tag

	                otag['collapsed'] = true if self_closing
	                stack.unshift(otag)
	            elsif piece =~ /<[^\/]*\/.*?>/ # closing tag
	                # Only the tag name is lowercased, never attribute values.
	                sname = piece.gsub(/[<>\/]/, '').strip.split(/\s+/).first.to_s.downcase
	                nindex = stack.find_index { |node| node['nodeName'] == sname && !node['collapsed'] }
	                next unless nindex # stray closing tag

	                otag = stack[nindex]
	                otag['collapsed'] = true
	                # Adopt the newer nodes as children, closing any tag among
	                # them that is still open.
	                otag['childNodes'] = collapse(stack.slice!(0...nindex)).reverse
	            else # text node
	                stack.unshift({'_id_' => id += 1, 'nodeName' => 'text', 'nodeValue' => piece, 'collapsed' => true})
	            end
	        end

	        collapse(stack)
	        stack.length == 1 ? stack.first : {'nodeName' => 'root', 'childNodes' => stack.reverse}
	    end

	    # Detects and removes the trailing slash of an XHTML-style tag such as
	    # <br/> or <img src="a.png" />.
	    #
	    # @param ainfos [Array<String>] whitespace-separated tag tokens (modified)
	    # @return [Boolean] true when the tag was written as self-closing
	    def strip_self_closing_slash(ainfos)
	        last = ainfos.last
	        return false unless last && last.end_with?('/')
	        # An unquoted value such as href=/dir/ legitimately ends with a slash.
	        return false if last.include?('=') && last !~ /["']\/\z/

	        trimmed = last.chomp('/')
	        if trimmed.empty?
	            ainfos.pop
	        else
	            ainfos[-1] = trimmed
	        end
	        true
	    end

	    # Builds the node hash for an opening tag.
	    #
	    # Tag and attribute names are lowercased; attribute values keep their
	    # case and may contain '='. Tokens without a name or a value are skipped.
	    # NOTE(review): an attribute literally named 'collapsed' or '_id_' would
	    # overwrite the bookkeeping keys of the node.
	    #
	    # @param ainfos [Array<String>] tag name followed by attribute tokens
	    # @param nid [Integer] sequence number stored under '_id_'
	    # @return [Hash, nil] the node, or nil for ignored tags
	    def split_info(ainfos, nid)
	        aignore_tag = '!doctype,meta,link'.split(',')
	        asingle_tag = 'area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed'.split(',')

	        sname = ainfos.shift.to_s.downcase
	        return if aignore_tag.include?(sname)

	        # Tags that never take a closing tag start out closed.
	        otag = {'_id_' => nid, 'nodeName' => sname, 'collapsed' => asingle_tag.include?(sname)}
	        ainfos.each do |token|
	            key, value = token.split('=', 2)
	            next if key.nil? || key.empty? || value.nil? || value.empty?

	            otag[key.downcase] = value.gsub(/\A["']|["']\z/, '')
	        end
	        otag
	    end

	    # Closes every tag that is still open in a newest-first node list.
	    #
	    # Each open tag adopts all nodes in front of it (those that followed it
	    # in the document) as its children.
	    #
	    # @param atag [Array<Hash>] newest-first node list (modified in place)
	    # @return [Array<Hash>] the same list, now containing only closed nodes
	    def collapse(atag)
	        while (nindex = atag.index { |node| !node['collapsed'] })
	            otag = atag[nindex]
	            otag['childNodes'] = atag.slice!(0...nindex).reverse
	            otag['collapsed'] = true
	        end
	        atag
	    end
	end
No results found