// CWMcleaner.java

/**
 * 
 */
package edu.odu.cs.cowem.documents;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * The pegDown processor gets confused by passthrough HTML display elements
 * (though inline elements are OK), often adding bogus {@code <p>} elements
 * around the opening and/or closing tag, resulting in bad XML.
 *
 * As a workaround, the macro processor can be used to insert
 * {@code <cwm tag='tagname' ... />} empty elements. Here we change
 * these to {@code <tagname ... >}, after stripping away any immediately
 * adjacent {@code <p>} and {@code </p>} tags.
 *
 * @author Zeil
 */
public class CWMcleaner implements TextSubstitutions {

    /**
     * Regular expression used to detect cwm tags, together with an optional
     * immediately preceding {@code <p>} and an optional immediately
     * following {@code </p>}. Group 2 captures the attribute text of the
     * cwm element. Compiled once and shared by all instances.
     */
    private static final Pattern CWM_PATTERN =
            Pattern.compile("(<p> *)?[<]cwm ([^>]*)/[>]( *</p>)?");

    /**
     * Prefix identifying the attribute that carries the real tag name.
     */
    private static final String TAG_ATTR_START = "tag=";


    /**
     * Create a cleaner.
     */
    public CWMcleaner() {
        // No per-instance state: the pattern is a shared constant.
    }


    /**
     * Apply substitutions.
     *
     * Every cwm element that carries a non-empty {@code tag} attribute is
     * replaced by {@code <tagname ...>}, where the remaining attributes are
     * copied across separated by single spaces. If the tag name begins with
     * '/', (i.e., it denotes a closing tag) the remaining attributes are
     * dropped. A cwm element from which no tag name can be extracted is
     * copied to the output unchanged rather than being turned into
     * malformed markup.
     *
     * @param target  original string
     * @return original after applying substitutions.
     */
    @Override
    public final String apply(final String target) {
        StringBuilder buffer = new StringBuilder();
        Matcher matcher = CWM_PATTERN.matcher(target);
        int start = 0;

        while (matcher.find()) {
            // Attributes are separated by runs of one or more spaces.
            String[] attributePairs = matcher.group(2).split("  *");
            String tag = extractTagName(attributePairs);

            // Copy the untouched text between the previous match and this one.
            buffer.append(target, start, matcher.start());

            if (tag.isEmpty()) {
                // No usable tag name: keep the original text verbatim
                // instead of emitting something like "<>" or "< attr='x'>".
                buffer.append(matcher.group());
            } else {
                appendTag(buffer, tag, attributePairs);
            }

            start = matcher.end();
        }
        buffer.append(target, start, target.length());

        return buffer.toString();
    }


    /**
     * Find the value of the first {@code tag=} attribute.
     *
     * The first and last characters of the value are assumed to be the
     * enclosing quotes and are removed.
     *
     * @param attributePairs  space-separated pieces of the cwm element
     * @return the tag name, or an empty string if there is no tag attribute
     *         or its value is too short to hold a pair of quotes.
     */
    private static String extractTagName(final String[] attributePairs) {
        for (String pair: attributePairs) {
            if (pair.startsWith(TAG_ATTR_START)) {
                String quoted = pair.substring(TAG_ATTR_START.length());
                // Guard against values such as "tag=" or "tag=x", for which
                // stripping two quote characters would run out of bounds.
                if (quoted.length() < 2) {
                    return "";
                }
                return quoted.substring(1, quoted.length() - 1);
            }
        }
        return "";
    }


    /**
     * Append the replacement tag to the output.
     *
     * @param buffer  output being assembled
     * @param tag  name of the tag to write (non-empty)
     * @param attributePairs  space-separated pieces of the cwm element
     */
    private static void appendTag(final StringBuilder buffer,
            final String tag, final String[] attributePairs) {
        buffer.append('<');
        buffer.append(tag);

        // Closing tags cannot carry attributes.
        if (!tag.startsWith("/")) {
            for (String pair: attributePairs) {
                // An empty piece arises when extra spaces follow "cwm";
                // skipping it avoids a stray blank before '>'.
                if (!pair.isEmpty() && !pair.startsWith(TAG_ATTR_START)) {
                    buffer.append(' ');
                    buffer.append(pair);
                }
            }
        }
        buffer.append('>');
    }

}