Powered By Blogger

Monday, September 12, 2011

Neko HTML Fragment Parser with Xml Transform

import java.io.ByteArrayInputStream;
import java.io.StringWriter;

import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.xpath.XPathAPI;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.html.HTMLDocument;
import org.xml.sax.InputSource;

/**
*Working with html fragements and tranforming node result
*to xml string
*/
public class HtmlFragmentTest {

String exp = "//TABLE[@class=\"subscribe_form\"]";
String xml = null;


public HtmlFragmentTest() {

StringBuffer sb = new StringBuffer();

sb.append( "<h2>Contributing to the FAQ</h2>\n" + "<p>If you think that you have a FAQ that's not answered here, or if you\n" + "see something that needs a correction/update, please\n" + "<a href=\"/contribute/\">contribute</a>!</p>\n" );

sb.append( "<table class=\"subscribe_form\" cellpadding=\"0\" cellspacing=\"0\"><tr>\n");

sb.append( "<td class=\"label\">\n" );

sb.append( "To get updates by email whenever the FAQ is updated, enter your email"+ "address here and click "Subscribe:"\n" );

sb.append( "</td>\n" + "<td>\n" + "<form method=\"post\" action=\"/notify.php\">\n" );

sb.append( "<div>\n" + "<input type=\"text\" size=\"20\" name=\"email\" />\n" + "<input type=\"submit\" value=\"Subscribe\" />\n" + "</div>\n" );

sb.append( "</form>\n" + "</td>\n" + "</tr></table>");

xml = sb.toString();

}

public void test() {

HTMLDocument document = new HTMLDocumentImpl();
DocumentFragment doc;
try {

DOMFragmentParser parser = new DOMFragmentParser( );

//parser.setFeature("http://cyberneko.org/html/features/insert-namespaces", true);
parser.setFeature ( "http://xml.org/sax/features/namespaces", false );
parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower" ); // has no effect, cannot override xerces configuration
parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "lower" ); // has no effect, cannot override xerces configuration
parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",true);

doc = document.createDocumentFragment();
InputSource inputSource = new InputSource( new ByteArrayInputStream( xml.getBytes() ) );

parser.parse(inputSource, doc);

Node node = XPathAPI.selectSingleNode(doc, exp);
this.xml = transform( node );
System.out.println( "---------------done once --------------");

} catch(Exception ex) {
ex.printStackTrace();
//return null;
}
}

public static void main( String[] args ) {
HtmlFragmentTest test = new HtmlFragmentTest();
test.test();
test.test();
}

/**
*
* @param node
* @return
* @throws TransformerException
*/
public static String transform( Node node ) throws TransformerException {
StringWriter sw = new StringWriter();
Transformer serializer = TransformerFactory.newInstance().newTransformer();
serializer.transform( new DOMSource( node ), new StreamResult(sw));
String result = sw.toString();
System.out.println( result );
return result;
}

}