Created
May 27, 2013 08:58
-
-
Save binjoo/5655961 to your computer and use it in GitHub Desktop.
JAVA:清除HTML标签
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Utility for stripping HTML markup from a string, leaving only its text.
 *
 * <p>The implementation is regex based and therefore heuristic: it is meant
 * for producing plain-text previews, not for sanitizing untrusted HTML.
 */
public class HTMLSpirit {

    /** Matches a whole {@code <script>...</script>} element, including its body. */
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);

    /** Matches a whole {@code <style>...</style>} element, including its body. */
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches markup only: comments, declarations such as {@code <!DOCTYPE>},
     * processing instructions, and opening/closing tags whose name starts with
     * a letter. A bare {@code <} followed by a space or a digit (as in
     * {@code "1 < 2"}) is deliberately NOT matched, so ordinary text that
     * contains a less-than sign is no longer swallowed up to the next
     * {@code >}.
     */
    private static final Pattern TAG_PATTERN =
            Pattern.compile("<!--[\\s\\S]*?-->|<![^>]*>|<\\?[^>]*>|</?[a-zA-Z][^>]*>");

    /**
     * Removes script elements, style elements and all remaining HTML tags from
     * the given string.
     *
     * <p>Limitation: text such as {@code "a<b and c>d"}, where a letter
     * directly follows {@code <}, is still indistinguishable from a tag and
     * will be removed.
     *
     * @param htmlStr the HTML source; may be {@code null}
     * @return the text with markup removed and leading/trailing whitespace
     *         trimmed; an empty string when {@code htmlStr} is {@code null}
     */
    public static String delHTMLTag(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        // Script and style elements go first so that their bodies are dropped
        // together with the tags instead of being left behind as text.
        String text = SCRIPT_PATTERN.matcher(htmlStr).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        // Strip every remaining tag, comment and declaration.
        text = TAG_PATTERN.matcher(text).replaceAll("");
        return text.trim();
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
含有小于号"< ",就出现截取现象了,:(