001    /*
002     * Copyright (c) 2005 Marko Karppinen & Co. LLC
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.karppinen.xml;
024    
025    import java.util.regex.Matcher;
026    import java.util.regex.Pattern;
027    
028    /**
029     * @version $Id: CharacterUtil.java,v 1.1 2005/05/07 11:09:35 hsivonen Exp $
030     * @author hsivonen
031     */
public class CharacterUtil {

    /**
     * Regular expression matching any single code point that is NOT allowed by
     * the XML 1.0 <code>Char</code> production:
     * <code>#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]</code>.
     *
     * java.util.regex matches by code point, so the supplementary range has to
     * be listed explicitly; otherwise every character above U+FFFD would be
     * treated as illegal. The surrogate block U+D800-U+DFFF is deliberately
     * left out of the allowed set so that unpaired surrogates are matched.
     */
    private static final String NOT_XML_CHAR =
            "[^\\x09\\x0A\\x0D\\x20-\\uD7FF\\uE000-\\uFFFD\\x{10000}-\\x{10FFFF}]";

    /** Matches exactly the code points that are illegal in XML 1.0 character data. */
    private static final Pattern MINIMAL = Pattern.compile(NOT_XML_CHAR);

    /**
     * Matches everything {@link #MINIMAL} matches, plus U+FEFF, the C1 controls
     * and DEL (U+007F-U+0084, U+0086-U+009F; U+0085 NEL is kept), the
     * noncharacters U+FDD0-U+FDEF and the last two code points of every
     * supplementary plane (U+1FFFE, U+1FFFF, ... U+10FFFE, U+10FFFF).
     */
    private static final Pattern PRUDENT = Pattern.compile(
            NOT_XML_CHAR
            + "|\\uFEFF"
            + "|[\\x7F-\\x84\\x86-\\x9F\\uFDD0-\\uFDEF]"
            + "|" + planeEndNoncharacters());

    /**
     * Builds a regex character class listing U+nFFFE and U+nFFFF for each of
     * the supplementary planes 1 through 16.
     */
    private static String planeEndNoncharacters() {
        StringBuilder cls = new StringBuilder("[");
        for (int plane = 1; plane <= 0x10; plane++) {
            int planeBase = plane << 16;
            cls.append("\\x{").append(Integer.toHexString(planeBase | 0xFFFE)).append('}');
            cls.append("\\x{").append(Integer.toHexString(planeBase | 0xFFFF)).append('}');
        }
        return cls.append(']').toString();
    }

    /**
     * Removes every character that is not permitted in XML 1.0 character data.
     * Valid supplementary characters (encoded as surrogate pairs) are kept;
     * unpaired surrogates, U+FFFE, U+FFFF and disallowed C0 controls are dropped.
     *
     * @param data the text to clean
     * @return a copy of <code>data</code> with the illegal characters removed
     * @throws NullPointerException if <code>data</code> is <code>null</code>
     */
    public static String scrubCharacterData(CharSequence data) {
        Matcher m = MINIMAL.matcher(data);
        return m.replaceAll("");
    }

    /**
     * Removes everything {@link #scrubCharacterData(CharSequence)} removes and,
     * in addition, characters that are legal but best avoided in XML: U+FEFF,
     * DEL and the C1 controls other than U+0085, and the Unicode noncharacters
     * (U+FDD0-U+FDEF and the final two code points of each supplementary plane).
     *
     * @param data the text to clean
     * @return a copy of <code>data</code> with the matched characters removed
     * @throws NullPointerException if <code>data</code> is <code>null</code>
     */
    public static String prudentlyScrubCharacterData(CharSequence data) {
        Matcher m = PRUDENT.matcher(data);
        return m.replaceAll("");
    }
}