001    /*
002     * Copyright (c) 2005 Marko Karppinen & Co. LLC
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.karppinen.xml;
024    
025    import gnu.xml.pipeline.EventConsumer;
026    import gnu.xml.pipeline.EventFilter;
027    
028    import org.xml.sax.SAXException;
029    
030    /**
031     * This filter replaces astral characters with U+FFFD REPLACEMENT CHARACTER. 
032     * This filter is useful when preparing data for a recipient that does not 
033     * handle astral characters gracefully.
034     * 
035     * @version $Id: AstralContentFilter.java,v 1.1 2006/10/30 19:57:09 hsivonen Exp $
036     * @author hsivonen
037     */
038    public class AstralContentFilter extends EventFilter {
039    
040        private static final char[] REPLACEMENT_CHARACTER = {'\uFFFD'};
041        
042        /**
043         * Constructs a new <code>AstralContentFilter</code>.
044         */
045        public AstralContentFilter() {
046            super();
047            setContentHandler(this);
048        }
049    
050        /**
051         * Constructs a new <code>AstralContentFilter</code> chaining it to an 
052         * <code>EventConsumer</code>.
053         * @param consumer the next <code>EventConsumer</code> in the chain
054         */
055        public AstralContentFilter(EventConsumer consumer) {
056            super(consumer);
057            setContentHandler(this);
058        }
059        
060        /**
061         * @see org.xml.sax.ContentHandler#characters(char[], int, int)
062         */
063        public void characters(char[] ch, int start, int length)
064                throws SAXException {
065            int s = start;
066            int i = start;
067            int end = start + length;
068            while(i < end) {
069                char c = ch[i];
070                if('\uD800' <= c && c <= '\uDFFF') {
071                    // Found a surrogate. Flush.
072                    if(s < i) {
073                        super.characters(ch, s, i - s);                    
074                    }
075                    // Emit replacement char once per pair
076                    if('\uD800' <= c && c <= '\uDBFF') {
077                        super.characters(REPLACEMENT_CHARACTER, 0, 1);                                        
078                    }
079                    s = i + 1;
080                }
081                i++;
082            }
083            // Flush.
084            if(s < i) {
085                super.characters(ch, s, i - s);                    
086            }
087        }
088    }