富文本编辑器过滤XSS注入(JSOUP)
众所周知,让用户在富文本编辑器中自由输入绝对不是一个明智的选择,但有的时候又别无他法。所以只有一条原则可以保证系统的安全性:我们允许用户输入什么,用户才能输入什么,而不是用户想输入什么就能输入什么。这样才能让系统处于我们的掌控之中,不至于出现各种娄子,比如各种XSS注入。
后来我们发现有一个比较好用的工具——JSOUP,它能够对输入的html进行过滤,简单来说就是可以设置白名单和黑名单(基于正则表达式)。白名单就是只允许某个html标签带有固定的属性,比如我们只允许<div height="100">,即div上只允许有height属性,其他属性我们都认为是非法的,这时就可以用jsoup设置白名单进行过滤。我们也可以设置黑名单,即我们认为<div>标签什么属性都可以有,但style是我们无法控制的,就把它列入黑名单,这同样可以用jsoup实现。
下面贴出一个样例:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sf.json.JSONObject;
import net.sf.json.JsonConfig;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
// NOTE(review): the leading package segments of this import were lost when the
// sample was extracted; restore the project's real package for JsonMapper.
import dules.mapper.JsonMapper;
public class HTMLStringFilter {
/** Regular expression for any tag-like span that opens with '<' and closes with '>'. */
private static final String regxpForHtml = "<([^>]*)>";

/** The literal marker "[图⽚]" ("[picture]"); its use lies outside this excerpt. */
private static final String PICTURE = "[图⽚]";

// Disabled patterns kept for reference:
//   regxpForImgTag          = "<\\s*img\\s+([^>]*)\\s*>"   -- finds IMG tags
//   regxpForImaTagSrcAttrib = "src=\"([^\"]+)\""           -- finds the SRC attribute of an IMG tag

/** Creates a filter instance; every helper visible in this class is static. */
public HTMLStringFilter() {
    // nothing to initialise
}
public static String HTMLEncode(String fString){
placeAll(" <", "<");
placeAll(">", ">");
placeAll(new String(new char[]{32}), " ");
placeAll(new String(new char[]{9}), " ");
placeAll(new String(new char[]{34}), """);
placeAll(new String(new char[]{39}), "'");
placeAll(new String(new char[]{13}), "");
placeAll(new String(new char[]{10,10}), " </p> <p>");
placeAll(new String(new char[]{10}), " <br>");
return fString;
/**
 * Escapes angle brackets so that the input cannot open or close an HTML tag.
 *
 * <p>Only {@code <} and {@code >} are rewritten (to {@code &lt;} and
 * {@code &gt;}); quotes, ampersands and URL schemes are left as they are.
 *
 * @param input the text to escape; may be {@code null}
 * @return the escaped text, or {@code null} when {@code input} is {@code null}
 */
public static String xssEscape(String input) {
    if (input == null) {
        return null;
    }
    // Literal replacement: neither character needs regular-expression handling.
    return input.replace("<", "&lt;").replace(">", "&gt;");
}
/**
 * Escapes angle brackets everywhere except in the opening {@code "<" + tag}
 * sequence of the given tag.
 *
 * <p>Each literal occurrence of {@code "<" + tag} is copied to the output
 * unchanged; the text between occurrences is passed through
 * {@link #xssEscape(String)}. Consequently the {@code >} that closes the tag
 * and any {@code </tag>} are still escaped.
 *
 * <p>The text is split around the tag openings instead of being swapped for an
 * {@code "@tag@"} placeholder, so input that happens to contain that
 * placeholder text can no longer be turned into a real tag opening.
 *
 * @param str the text to escape; may be {@code null}
 * @param tag the tag name to keep (matched literally, case-sensitively); when
 *            {@code null} or empty the whole string is escaped
 * @return the escaped text, or {@code null} when {@code str} is {@code null}
 */
public static String xssEscapeExceptTag(String str, String tag) {
    if (str == null || tag == null || tag.isEmpty()) {
        // Nothing to exempt: an empty tag would otherwise exempt every '<'.
        return xssEscape(str);
    }
    String openTag = "<" + tag;
    StringBuilder out = new StringBuilder(str.length() + 16);
    int from = 0;
    int hit;
    while ((hit = str.indexOf(openTag, from)) >= 0) {
        out.append(xssEscape(str.substring(from, hit))).append(openTag);
        from = hit + openTag.length();
    }
    out.append(xssEscape(str.substring(from)));
    return out.toString();
}
public static void main(String[] args){
//  System.out.println(new java.util.Date().getTime());
//  System.out.println(HTMLStringFilter.filterSafe("< script >ddd</div>"));
/
/  System.out.println(HTMLStringFilter.filterSafe("< div >ddd</div>"));
//  System.out.println("======"+HTMLStringFilter.filterSafe("< div oncliCk=''><img src='s.jsp'/>ddd</div>"));
//
//  String imgHTML="<img src=\"http:\"/>";
//  String tag="img";
//  System.out.println("filter except:"+filterHtmlExceptTag(imgHTML, tag));
//
//  System.out.println(new java.util.Date().getTime());
//
//  String source="aaaaa<img alt=\"[可爱]\" src=\"sinajs/t4/appstyle/expression/ext/normal/14/tza_thumb.gif\" height=\"22\" width=\"
22\" />bbbb<img a //  String title=replaceTag(source, "img", "alt");
//  System.out.println("title=="+title);
//
//  String s="<img src=\"img7.9158/200708/10/09/18/200708103758836.jpg\"/>";
//  List<String> srcs=match(source, "img", "src");
//  if (CollectionUtils.isNotEmpty(srcs)) {
//  for (String att : srcs) {
//    System.out.println("attr=="+att);
//  }
//  }
//
/
/  System.out.println("html标签替换=="+replaceHtmlTagOfText(s, "img", "[图⽚]"));
//
String htmlStr="<html>bb<img style='display:inline;' alt='[挤眼]' src='sinajs/t4/appstyle/expression/ext/normal/c3/zy_thumb.gif' height='22' width='22  List<String> srcs=getImgHTML(htmlStr);
for (String src : srcs) {
System.out.println("======="+src);
}
//  System.out.println("=HTMLEncode=="+);
//  List<String> htmls=getImgHTML(htmlStr);
//  List<String> srcs=getImgSrc(htmlStr);
//  System.out.println("--"+htmls.size()+"=="+srcs.size());
//
//  for (String s : htmls) {
//  System.out.println("----"+s);
//  System.out.placeFirst(s, "[图⼀]"));
//  }
//  for (String s : srcs) {
//  System.out.println("==="+s);
//  }
}
/**
* 过滤⼀下字符串,连同前后< xxx >yyy< / xxx >全部消除。
* 不区分⼤⼩写、空格可识别
* <br>"function", "window\\.", "javascript:", "script",replaceall()
* <br>"js:", "about:", "file:", "document\\.", "vbs:", "frame",
* <br>"cookie", "onclick", "onfinish", "onmouse", "onexit=",
* <br>"onerror", "onclick", "onkey", "onload", "onfocus", "onblur"  * @param htmlStr
* @return
*/
public static String filterSafe(String htmlStr){
Pattern p = null; // 正则表达式
Matcher m = null; // 操作的字符串
StringBuffer tmp = null;
String str = "";
boolean isHave = false;
String[] Rstr = { "meta", "script", "object", "embed" };
if (htmlStr == null || !(htmlStr.length() > 0)) {
return "";
}
str = LowerCase();
for (int i = 0; i < Rstr.length; i++) {
p = Patternpile("<" + Rstr[i] + "(.[^>])*>");
m = p.matcher(str);
tmp = new StringBuffer();
if (m.find()) {
m.appendReplacement(tmp, "<" + Rstr[i] + ">");
while (m.find()) {
m.appendReplacement(tmp, "<" + Rstr[i] + ">");
}
isHave = true;
}
m.appendTail(tmp);
str = String();
p = Patternpile("</" + Rstr[i] + "(.[^>])*>");
m = p.matcher(str);
tmp = new StringBuffer();
if (m.find()) {
m.appendReplacement(tmp, "</" + Rstr[i] + ">");
while (m.find()) {
m.appendReplacement(tmp, "</" + Rstr[i] + ">");
}
isHave = true;
}
m.appendTail(tmp);
str = String();
}
/
/ System.out.println(str);
String[] Rstr1 = { "function", "window\\.", "javascript:", "script",
"js:", "about:", "file:", "document\\.", "vbs:", "frame",
"cookie", "onclick", "onfinish", "onmouse", "onexit=",
"onerror", "onclick", "onkey", "onload", "onfocus", "onblur" };
for (int i = 0; i < Rstr1.length; i++) {
p = Patternpile("<([^<>])*" + Rstr1[i] + "([^<>])*>([^<>])*</([^<>])*>");
m = p.matcher(str);
tmp = new StringBuffer();
if (m.find()) {
m.appendReplacement(tmp, "");
while (m.find()) {
m.appendReplacement(tmp, "");
}
isHave = true;
}
m.appendTail(tmp);
str = String();
}
if (isHave) {
htmlStr = str;
}
htmlStr = placeAll("%3C", "<");
htmlStr = placeAll("%3E", ">");
htmlStr = placeAll("%2F", "");
htmlStr = placeAll("&#", "<b>&#</b>");
return htmlStr;
}
/**
* 采⽤jsoup⽩名单⽅式过滤⾮法的html字符。
* 原理:
* 1.⾸先通过⽩名单过滤掉⾮法的html标签,即只允许输出⽩名单内的标签
* 2.对特殊的属性(主要是style)⽤正则过滤,只允许安全的属性值存在
* @param htmlStr 原始的html⽚段(⽤户通过富⽂本编辑器提交的html代码)
* @return 过滤后的安全的html⽚段
*/
public static String cleanSafeHtml(String htmlStr) {
Document doc = Jsoup.parseBodyFragment(htmlStr);
OutputSettings outSet = new OutputSettings();
outSet.prettyPrint(false);
outSet.outline(false);
doc.outputSettings(outSet);
Map<String, String> regexMap = initRegexMap();
if (regexMap != null) {
for (Map.Entry<String,String> Set()){
String key = Key();
Elements els = doc.select(key);
for (Element el:els) {
System.out.println("old el:"+el.toString());
String attribute = key.substring(key.indexOf("[")+1, key.indexOf("]"));
String attributeValue = el.attr(attribute);
Matcher valueMatcher = Value()).matcher(attributeValue);    if (valueMatcher.find()) {
String safeValue = up();
System.out.println("safeValue:"+safeValue);
el.attr(attribute, safeValue);
}
System.out.println("new el:"+el.toString());
}
}
}
Whitelist whitelist = initWhiteList();
String safeString = Jsoup.clean(doc.html(), "", whitelist);
System.out.println("safestring:"+safeString);
return safeString;
//  Elements els = doc.select("[style]");
//  for (Element el:els) {
/
/  System.out.println("old el:"+el.toString());
//  String styleattribute = el.attr("style");
//  Matcher styleMatcher = Patternpile(styleAttributeRegex).matcher(styleattribute);
//  if (styleMatcher.find()) {
//    String safeStyle = up();
//    System.out.println("safeStyle:"+safeStyle);
//    el.attr("style", safeStyle);
//  }
//  System.out.println("new el:"+el.toString());
//  }
//  Whitelist whitelist = laxed();
/
/  whitelist.addAttributes("span", "style");
//  String safeString = Jsoup.clean(doc.html(), "", whitelist);
//  System.out.println("safestring:"+safeString);
//  return safeString;
}
private static Whitelist whitelist = null;
private static Whitelist initWhiteList() {
if (whitelist == null) {
synchronized(new Object()) {
whitelist = new Whitelist();
String jsonString = null;
Resource resource = new ClassPathResource("/f");
File file = null;
InputStream input = null;
Writer output = null;
try {
file = File();
input = new FileInputStream(file);
output = new StringWriter();
jsonString = String();
} catch (IOException e) {
/
/ TODO Auto-generated catch block
e.printStackTrace();
}finally {
if (input != null) {
IOUtils.closeQuietly(input);
}
if (output != null) {
IOUtils.closeQuietly(output);
}
}
JsonConfig config = new JsonConfig();
config.setIgnoreDefaultExcludes(true);//这⾥不设置,会把class属性过滤掉
JSONObject jsonObject = JSONObject.fromObject(jsonString,config);
JSONObject whitelistjson = JSONObject("whiteList");
JSONObject protocolsjson = JSONObject("protocols");
JsonMapper newMapper = new JsonMapper();
Map<String, Map<String, String>> whitelistmap = newMapper.String(), HashMap.class);    Map<String, List<String>> protocolsmap = newMapper.String(), HashMap.class);
for (Map.Entry<String, Map<String, String>> Set()){
String tag = Key();
whitelist.addTags(tag);
for (Map.Entry<String,String> Value().entrySet()){
String attribute = Key();

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。