001 /* 002 * Copyright (c) 2005 Marko Karppinen & Co. LLC 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package fi.karppinen.xml; 024 025 import java.util.regex.Matcher; 026 import java.util.regex.Pattern; 027 028 /** 029 * @version $Id: CharacterUtil.java,v 1.1 2005/05/07 11:09:35 hsivonen Exp $ 030 * @author hsivonen 031 */ 032 public class CharacterUtil { 033 034 private final static Pattern MINIMAL = Pattern.compile("[^\\x09\\x0A\\x0D\\u0020-\\uFFFD]"); 035 036 // FIXME include UTF-16 representations of U+?FFFE and U+?FFFF. 037 private final static Pattern PRUDENT = Pattern.compile("[^\\x09\\x0A\\x0D\\u0020-\\uFFFD]|\\uFEFF|[\\x7F-\\x84]|[\\x86-\\x9F]|[\\uFDD0-\\uFDDF]"); 038 039 public static String scrubCharacterData(CharSequence data) { 040 Matcher m = MINIMAL.matcher(data); 041 return m.replaceAll(""); 042 } 043 public static String prudentlyScrubCharacterData(CharSequence data) { 044 Matcher m = PRUDENT.matcher(data); 045 return m.replaceAll(""); 046 } 047 }