interchange/dpart_to_xml
Transform the DPart tree ("Document Part Hierarchy") of a PDF/VT document to XML.
Download: Java code. Sample output and sample input (starter_pdfvt1.pdf) are available.
/*
* pCOS sample application for dumping the "Document Part Hierarchy" of a PDF/VT
* document as XML as specified in Annex D of ISO 16612-2:2010. If no Document
* Part Hierarchy is present, an XML file with an empty "<PDFVT>" element is
* created.
*
* In order to generate well-formed XML a Java-specific method for generating
* the XML output is used. When porting this code to other programming
* languages, this must be replaced accordingly.
*
* The code to dump a DPM dictionary is not protected against incorrect endless
* recursive dictionaries. A check for this should be implemented in
* production code.
*
* Required software: pCOS interface 8 (PDFlib+PDI/PPS 9, TET 4.1, PLOP 5.0)
* Required data: PDF/VT or PDF 2.0 document with DParts
*/
package com.pdflib.cookbook.pcos.interchange;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.NumberFormat;
import java.util.Locale;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.xml.sax.helpers.AttributesImpl;
import com.pdflib.IpCOS;
import com.pdflib.cookbook.pcos.pcos_cookbook_example;
public class dpart_to_xml extends pcos_cookbook_example {
    /**
     * Max. length of a sub-array in the DParts array of a DPart dictionary,
     * as specified by ISO 16612-2:2010.
     */
    private static final int DPARTS_MAX_LENGTH = 8192;

    /* This is where the data files are. Adjust as necessary. */
    private final static String SEARCH_PATH = "../input";

    /**
     * Dump the "Document Part Hierarchy" of the given document as XML to a
     * file named after the input document basename with the suffix
     * ".dpart.xml" in the current directory.
     *
     * @param p
     *            IpCOS object
     * @param doc
     *            document handle
     *
     * @throws Exception
     *             a pCOS error occurred, the document contains no usable
     *             DPartRoot dictionary, or the XML serialization failed
     */
    public void example_code(IpCOS p, int doc) throws Exception {
        /* Derive the XML output file name from the input document name. */
        String filename = p.pcos_get_string(doc, "filename");
        File input_file = new File(filename);
        String basename = input_file.getName();
        String xml_name = basename + ".dpart.xml";

        System.out.println("Writing Document Part Hierarchy to file "
            + xml_name);

        /*
         * try-with-resources guarantees that the output file is flushed and
         * closed even if an exception is thrown while walking the DPart tree
         * (the original code leaked the Writer in that case).
         */
        try (Writer xml = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(xml_name), "UTF-8"))) {
            /*
             * Set up an identity transformation that serializes the SAX
             * events generated below as indented UTF-8 XML.
             */
            StreamResult xml_result = new StreamResult(xml);
            SAXTransformerFactory transformer_factory =
                (SAXTransformerFactory) SAXTransformerFactory.newInstance();
            transformer_factory.setAttribute("indent-number",
                Integer.valueOf(2));
            TransformerHandler handler =
                transformer_factory.newTransformerHandler();
            Transformer serializer = handler.getTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            serializer.setOutputProperty(OutputKeys.METHOD, "xml");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            handler.setResult(xml_result);
            handler.startDocument();

            /*
             * XML attributes are not used in this conversion, but an empty
             * AttributesImpl object must be provided to startElement().
             */
            AttributesImpl attributes = new AttributesImpl();

            /*
             * Put some information about the input document into an XML
             * comment.
             */
            String comment = "\nInput PDF document: " + filename
                + "\nPDF version: "
                + p.pcos_get_string(doc, "pdfversionstring")
                + "\nPDF/VT version: " + p.pcos_get_string(doc, "pdfvt")
                + "\n";
            handler.comment(comment.toCharArray(), 0, comment.length());

            /*
             * Fixed XML root element of the document.
             */
            handler.startElement("", "", "PDFVT", attributes);

            String objtype = p.pcos_get_string(doc, "type:/Root/DPartRoot");
            if (objtype.equals("dict")) {
                /*
                 * Retrieve the required list of node names: one XML element
                 * name per level of the DPart tree.
                 */
                objtype = p.pcos_get_string(doc,
                    "type:/Root/DPartRoot/NodeNameList");
                if (!objtype.equals("array")) {
                    throw new Exception(
                        "Required entry \"NodeNameList\" is missing in DPartRoot dictionary or has wrong type");
                }

                int num_names = (int) p.pcos_get_number(doc,
                    "length:/Root/DPartRoot/NodeNameList");
                String[] node_names = new String[num_names];
                for (int i = 0; i < num_names; i += 1) {
                    node_names[i] = p.pcos_get_string(doc,
                        "/Root/DPartRoot/NodeNameList[" + i + "]");
                }

                objtype = p.pcos_get_string(doc,
                    "type:/Root/DPartRoot/DPartRootNode");
                if (!objtype.equals("dict")) {
                    throw new Exception(
                        "Required entry \"DPartRootNode\" is missing in DPartRoot dictionary or has wrong type");
                }

                /*
                 * Invoke the recursive method that dumps the document part
                 * hierarchy.
                 */
                dump_dpart_node(p, doc, handler, 0, 0, node_names,
                    "/Root/DPartRoot/DPartRootNode");
            }
            else {
                throw new Exception(
                    "DPartRoot dictionary missing from Catalog or has wrong type");
            }

            handler.endElement("", "", "PDFVT");

            /*
             * endDocument() is required by the SAX ContentHandler contract
             * and flushes all pending events to the output file; the
             * original code never called it and relied on close() alone.
             */
            handler.endDocument();
        }
    }

    /**
     * Recursively transform a DPart node to XML.
     *
     * @param p
     *            IpCOS object
     * @param doc
     *            document handle
     * @param handler
     *            XML serialization handler
     * @param level
     *            level in DPart tree (zero-based)
     * @param start_page
     *            zero-based number of start page of DPart subtree
     * @param node_names
     *            array containing the names for the levels of the DPart tree
     * @param dpart_node_path
     *            pCOS path of the DPart node
     * @return number of pages covered by the DPart subtree
     *
     * @throws Exception
     *             a pCOS error occurred or a non-recoverable error was
     *             discovered in the DPart tree
     */
    private int dump_dpart_node(IpCOS p, int doc, TransformerHandler handler,
        int level, int start_page, String[] node_names,
        String dpart_node_path) throws Exception {
        /*
         * Consistency check: the DPart tree may not be deeper than the
         * length of the NodeNameList array, as for every level of the tree
         * the corresponding entry from the NodeNameList array is used to
         * name the XML element.
         */
        if (level >= node_names.length) {
            throw new Exception(
                "Document part hierarchy is deeper than allowed by the length of the NodeNameList array in the DPartRoot dictionary");
        }

        /*
         * XML attributes are not used in the DPart-to-XML conversion, but an
         * empty AttributesImpl object must be provided nevertheless.
         */
        AttributesImpl attributes = new AttributesImpl();
        handler.startElement("", "", node_names[level], attributes);

        /*
         * Transform the document part metadata to XML if present.
         */
        String dpm_path = dpart_node_path + "/DPM";
        String objtype = p.pcos_get_string(doc, "type:" + dpm_path);
        if (objtype.equals("dict")) {
            handler.startElement("", "", "DPM", attributes);
            dump_dpm(p, doc, handler, dpm_path);
            handler.endElement("", "", "DPM");
        }
        else if (!objtype.equals("null")) {
            System.err.println("Warning: Unexpected type \"" + objtype
                + "\" for DPM entry, skipping it (pCOS Path \""
                + dpart_node_path + "\")");
        }

        /*
         * Retrieve information whether this is an inner node of the tree or
         * a leaf: inner nodes carry a DParts array, leaf nodes a Start (and
         * optionally an End) page reference.
         */
        String dparts_array = dpart_node_path + "/DParts";
        String dparts_objtype = p.pcos_get_string(doc, "type:"
            + dparts_array);
        String start_objtype = p.pcos_get_string(doc, "type:"
            + dpart_node_path + "/Start");
        String end_objtype = p.pcos_get_string(doc, "type:"
            + dpart_node_path + "/End");

        if (!dparts_objtype.equals("null") && !start_objtype.equals("null")) {
            throw new Exception(
                "DPart dictionary contains both a DParts and a Start key (pCOS Path \""
                    + dpart_node_path + "\")");
        }

        /* page count for subtree */
        int page_count = 0;

        if (!dparts_objtype.equals("null")) {
            if (dparts_objtype.equals("array")) {
                /*
                 * This is an array of arrays, where each sub-array has a
                 * maximum of 8192 entries.
                 */
                int dparts_length = (int) p.pcos_get_number(doc, "length:"
                    + dparts_array);
                for (int i = 0; i < dparts_length; i += 1) {
                    String dparts_array_entry = dparts_array + "[" + i + "]";
                    objtype = p.pcos_get_string(doc, "type:"
                        + dparts_array_entry);
                    if (!objtype.equals("array")) {
                        throw new Exception(
                            "DParts array entry has wrong type \"" + objtype
                                + "\" (pCOS path \"" + dparts_array_entry
                                + "\")");
                    }

                    /*
                     * Check that every sub-array but the last one has exactly
                     * 8192 entries; the last sub-array only gets a sanity
                     * check for being over-long or empty.
                     */
                    int dparts_entry_length = (int) p.pcos_get_number(doc,
                        "length:" + dparts_array_entry);
                    if (i < dparts_length - 1
                            && dparts_entry_length != DPARTS_MAX_LENGTH) {
                        System.err.println("Warning: DParts sub-array has "
                            + dparts_entry_length
                            + " entries, should be 8192 (pCOS path \""
                            + dparts_array_entry + "\")");
                    }
                    else if (dparts_entry_length > DPARTS_MAX_LENGTH) {
                        System.err.println("Warning: DParts sub-array has more than 8192 entries (pCOS path \""
                            + dparts_array_entry + "\")");
                    }
                    else if (dparts_entry_length == 0) {
                        System.err.println("Warning: Empty DParts sub-array (pCOS path \""
                            + dparts_array_entry + "\")");
                    }

                    /*
                     * Recursively transform all subtrees to XML, advancing
                     * the page counter by the size of each subtree.
                     */
                    for (int j = 0; j < dparts_entry_length; j += 1) {
                        String dpart_child_path = dparts_array_entry + "["
                            + j + "]";
                        page_count += dump_dpart_node(p, doc, handler,
                            level + 1, start_page + page_count, node_names,
                            dpart_child_path);
                    }
                }
            }
            else {
                /*
                 * BUGFIX: report the type of the DParts entry itself; the
                 * original code printed the stale "objtype" value of the DPM
                 * entry here.
                 */
                throw new Exception("DParts entry has wrong type \""
                    + dparts_objtype + "\" (pCOS path \"" + dparts_array
                    + "\")");
            }
        }
        else {
            /*
             * Enumerate the pages that belong to the document part. Here we
             * have only a reference to the first and, if there is more than
             * one page in the document part, to the last page. To avoid a
             * complicated walk through the PDF Page tree, we make use of
             * the fact that the order of the page objects as defined by the
             * page tree is the same order in which Page objects are
             * referenced from leaf node DPart dictionaries in a depth-first
             * traversal. By maintaining a counter for the traversed pages we
             * can use the pCOS pseudo-object pages array.
             */
            if (!start_objtype.equals("dict")) {
                throw new Exception("Start entry has wrong type \""
                    + start_objtype + "\" (pCOS path \"" + dpart_node_path
                    + "/Start\")");
            }
            int start_id = (int) p.pcos_get_number(doc, "pcosid:"
                + dpart_node_path + "/Start");

            int end_id;
            if (end_objtype.equals("null")) {
                /* No End entry: the document part covers a single page. */
                end_id = start_id;
            }
            else {
                if (!end_objtype.equals("dict")) {
                    /*
                     * BUGFIX: report the type of the End entry; the original
                     * code printed the stale "objtype" value here.
                     */
                    throw new Exception("End entry has wrong type \""
                        + end_objtype + "\" (pCOS path \"" + dpart_node_path
                        + "/End\")");
                }
                end_id = (int) p.pcos_get_number(doc, "pcosid:"
                    + dpart_node_path + "/End");
                if (end_id == start_id) {
                    System.err.println("Warning: End entry present but points to the same page as Start entry (pCOS path \""
                        + dpart_node_path + "/End\")");
                }
            }

            /*
             * We cross-check that the traversal does not diverge from the
             * actual page order by comparing the object ids of the first and
             * the last page.
             */
            if (start_id != (int) p.pcos_get_number(doc, "pcosid:pages["
                    + start_page + "]")) {
                throw new Exception("Sequence of pages retrieved by depth-first traversal of document part tree does not map to order of Page objects");
            }

            /*
             * There is at least one page in the range. For each page produce
             * an empty <PDFPage/> element.
             */
            while (start_id != end_id) {
                page_count += 1;
                start_id = (int) p.pcos_get_number(doc, "pcosid:pages["
                    + (start_page + page_count) + "]");
                handler.startElement("", "", "PDFPage", attributes);
                handler.endElement("", "", "PDFPage");
            }
            page_count += 1;
            handler.startElement("", "", "PDFPage", attributes);
            handler.endElement("", "", "PDFPage");

            if (end_id != (int) p.pcos_get_number(doc, "pcosid:pages["
                    + (start_page + page_count - 1) + "]")) {
                throw new Exception("Sequence of pages retrieved by depth-first traversal of document part tree does not map to sequence of Page objects");
            }
        }

        handler.endElement("", "", node_names[level]);

        return page_count;
    }

    /**
     * Recursively transform an entry in the document part metadata to XML
     * according to Annex D of ISO 16612-2:2010.
     *
     * NOTE(review): as stated in the file header, this recursion is not
     * protected against cyclic dictionary references; production code should
     * add a visited-object or depth check.
     *
     * @param p
     *            IpCOS object
     * @param doc
     *            document handle
     * @param handler
     *            XML serialization handler
     * @param dpm_path
     *            pCOS path of the DPM entry
     * @throws Exception
     *             a pCOS error occurred or an error occurred in the XML
     *             serialization
     */
    private void dump_dpm(IpCOS p, int doc,
        TransformerHandler handler, String dpm_path)
        throws Exception {
        AttributesImpl attributes = new AttributesImpl();
        int dict_length = (int) p.pcos_get_number(doc, "length:" + dpm_path);

        for (int i = 0; i < dict_length; i += 1) {
            String entry_path = dpm_path + "[" + i + "]";
            String key = p.pcos_get_string(doc, entry_path + ".key");

            /*
             * Colons are legal in PDF names but not in XML element names;
             * map them to underscores.
             */
            String xml_key = key.replace(':', '_');
            handler.startElement("", "", xml_key, attributes);
            dump_dpm_entry(p, doc, handler, entry_path + ".val");
            handler.endElement("", "", xml_key);
        }
    }

    /**
     * Transform a single value in the DPM dictionary to XML. Strings, names,
     * numbers and booleans become character data; arrays become nested
     * "Item" elements; dictionaries and streams are transformed recursively.
     * Values of any other type are silently skipped.
     *
     * @param p
     *            IpCOS object
     * @param doc
     *            document handle
     * @param handler
     *            XML serialization handler
     * @param value_path
     *            pCOS path of the value
     * @throws Exception
     *             a pCOS error occurred or an error occurred in the XML
     *             serialization
     */
    private void dump_dpm_entry(IpCOS p, int doc, TransformerHandler handler,
        String value_path) throws Exception {
        String objtype = p.pcos_get_string(doc, "type:" + value_path);

        if (objtype.equals("string") || objtype.equals("name")) {
            String value = p.pcos_get_string(doc, value_path);
            handler.characters(value.toCharArray(), 0, value.length());
        }
        else if (objtype.equals("array")) {
            int array_length = (int) p.pcos_get_number(doc, "length:"
                + value_path);
            AttributesImpl a = new AttributesImpl();
            for (int j = 0; j < array_length; j += 1) {
                handler.startElement("", "", "Item", a);
                dump_dpm_entry(p, doc, handler, value_path + "[" + j + "]");
                handler.endElement("", "", "Item");
            }
        }
        else if (objtype.equals("dict") || objtype.equals("stream")
                || objtype.equals("fstream")) {
            dump_dpm(p, doc, handler, value_path);
        }
        else if (objtype.equals("number")) {
            /* Locale.US guarantees a '.' decimal separator in the output. */
            NumberFormat f = NumberFormat.getInstance(Locale.US);
            String value = f.format(p.pcos_get_number(doc, value_path));
            handler.characters(value.toCharArray(), 0, value.length());
        }
        else if (objtype.equals("boolean")) {
            /* pCOS reports booleans as numbers: non-zero means true. */
            String value =
                (int) p.pcos_get_number(doc, value_path) != 0 ?
                    "true" : "false";
            handler.characters(value.toCharArray(), 0, value.length());
        }
    }

    /**
     * Construct the example.
     *
     * @param argv
     *            command line arguments
     * @param readable_name
     *            human-readable name of the example
     * @param search_path
     *            search path for the input documents
     */
    public dpart_to_xml(String[] argv, String readable_name,
        String search_path) {
        super(argv, readable_name, search_path);
    }

    /**
     * Entry point.
     *
     * @param argv
     *            command line arguments
     */
    public static void main(String[] argv) {
        dpart_to_xml example = new dpart_to_xml(argv,
            "Document Part Hierarchy", SEARCH_PATH);
        example.execute();
    }
}