summaryrefslogtreecommitdiffstats
path: root/poxml/parser.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'poxml/parser.cpp')
-rw-r--r--poxml/parser.cpp1008
1 files changed, 1008 insertions, 0 deletions
diff --git a/poxml/parser.cpp b/poxml/parser.cpp
new file mode 100644
index 00000000..c34976bf
--- /dev/null
+++ b/poxml/parser.cpp
@@ -0,0 +1,1008 @@
+// #define POXML_DEBUG
+
+#include "parser.h"
+#include <iostream>
+#include <stdlib.h>
+#include <assert.h>
+#include <qregexp.h>
+
+using namespace std;
+
+// Null-terminated table of tag names this parser treats as empty elements.
+// isSingleTag() searches it; startElement() closes such tags with "/>",
+// endElement() writes no closing tag for them, and formatMessage() strips
+// them from the front of a message.
+static const char *singletags[] = {"beginpage","imagedata", "colspec", "spanspec",
+ "anchor", "xref", "area",
+ "footnoteref", "void", "inlinegraphic",
+ "glosssee", "graphic", "xi:include",
+ 0};
+// Null-terminated table of tag names at which text is cut into messages.
+// isCuttingTag() searches it (and additionally accepts the literal tags);
+// startElement()/endElement() track the nesting of such tags in `inside`
+// and a message is produced when the outermost one closes.
+// removeEmptyTags() also walks this table.
+//
+// "step", "remark" and "glossentry" are listed twice; the lookups stop at
+// the first match, so the duplicates have no effect.
+// NOTE(review): "blockingquote" and "refsysnopsisdivinfo" look like
+// misspellings of the DocBook names "blockquote" and "refsynopsisdivinfo"
+// -- confirm against the DTD before changing, since these strings decide
+// how documents are split.
+static const char *cuttingtags[] = {"bridgehead", "trans_comment", "para", "title", "term",
+ "entry", "contrib", "keyword", "example",
+ "note", "footnote", "caution",
+ "informalexample", "remark", "comment",
+ "imageobject", "varlistentry", "thead",
+ "tbody", "tgroup", "row", "screenshot", "screeninfo",
+ "variablelist", "step", "procedure",
+ "step", "holder", "listitem", "important",
+ "author", "itemizedlist", "orderedlist",
+ "caption", "textobject", "mediaobject",
+ "tip", "glossdef", "inlinemediaobject",
+ "simplelist", "member", "glossentry",
+ "areaspec", "corpauthor", "indexterm",
+ "calloutlist", "callout", "subtitle",
+ "table", "part", "xi:fallback", "primary",
+ "secondary", "chapter", "sect1", "sect2",
+ "figure", "abstract", "sect3", "sect", "sect4",
+ "warning", "preface", "authorgroup", "keywordset",
+ "informaltable", "qandaentry", "question", "answer",
+ "othercredit", "affiliation", "qandaset",
+ "cmdsynopsis", "funcsynopsis", "funcsynopsisinfo" ,
+ "epigraph", "attribution", "glossary", "chapterinfo",
+ "glossdiv", "blockingquote", "simplesect", "section",
+ "qandadiv", "refsect1", "refmeta", "formalpara",
+ "refentry", "refnamediv", "refpurpose", "refentrytitle",
+ "refmiscinfo", "refsect2", "refsect3", "refsect1info",
+ "refsect2info", "refsect3info", "refsection", "refsectioninfo",
+ "refsynopsisdiv", "refsysnopsisdivinfo", "remark",
+ "revdescription", "glossentry", "partinfo",
+ "segmentedlist", "segtitle", "seg", "seglistitem", "screenco",
+ 0};
+// Null-terminated table of tag names whose content is kept verbatim.
+// isLiteralTag() searches it; cleanupTags() runs escapeLiterals() on the
+// text between such tags and descape() leaves whitespace inside them alone.
+static const char *literaltags[] = {"literallayout", "synopsis", "screen",
+ "programlisting", 0};
+
+bool StructureParser::fatalError ( const QXmlParseException &e )
+{
+ cerr << "fatalError " << e.message().latin1() << " " << e.lineNumber() << " "
+ << e.columnNumber() << endl;
+ return false;
+}
+
+// Resets the per-document state: no message collected yet, not inside any
+// cutting tag, and the two helper regular expressions (position attributes
+// and the do-not-split condition) freshly built. Always returns true.
+bool StructureParser::startDocument()
+{
+    inside = 0;
+    message = "";
+
+    do_not_split_reg = QRegExp("\\s*condition=\"do-not-split\"");
+    infos_reg = QRegExp("\\s*poxml_line=\"(\\d+)\" poxml_col=\"(\\d+)\"");
+
+    return true;
+}
+
+// Returns true if qName is listed in cuttingtags; names not found there
+// are still accepted when isLiteralTag() recognises them.
+bool StructureParser::isCuttingTag(const QString &qName)
+{
+    for (const char **entry = cuttingtags; *entry; ++entry) {
+        if (*entry == qName)
+            return true;
+    }
+    return isLiteralTag(qName);
+}
+
+// Returns true if qName is listed in singletags.
+bool StructureParser::isSingleTag(const QString &qName)
+{
+    for (const char **entry = singletags; *entry; ++entry) {
+        if (*entry == qName)
+            return true;
+    }
+    return false;
+}
+
+// Returns true if qName is listed in literaltags.
+bool StructureParser::isLiteralTag(const QString &qName)
+{
+    for (const char **entry = literaltags; *entry; ++entry) {
+        if (*entry == qName)
+            return true;
+    }
+    return false;
+}
+
+// While a message is being collected, writes the skipped entity back into
+// it as "&name;". Always returns true.
+bool StructureParser::skippedEntity ( const QString & name )
+{
+    if (!inside)
+        return true;
+
+    message += QString("&%1;").arg(name);
+    return true;
+}
+
+// Handles an opening tag.
+//
+// A cutting tag seen while no message is being collected starts a new
+// message (resetting `message`, bumping the paragraph counter and recording
+// the locator position); every cutting tag increases the nesting counter
+// `inside`. While inside a message the tag is re-serialised into `message`
+// with its attributes plus poxml_line/poxml_col attributes holding the
+// locator position. Finally, ids of anchor, sect* and chapter elements are
+// registered as anchors. Always returns true.
+bool StructureParser::startElement( const QString& , const QString& ,
+                                    const QString& qName,
+                                    const QXmlAttributes & attr )
+{
+    QString tname = qName.lower();
+    bool startsMessage = false;
+
+    if (isCuttingTag(tname)) {
+        if (!inside) {
+            message = QString::null;
+            list.pc.increasePara();
+            startline = locator->lineNumber();
+            startcol = locator->columnNumber();
+            startsMessage = true;
+        }
+        ++inside;
+    }
+
+    if (inside) {
+        QString markup = "<" + tname;
+        for (int a = 0; a < attr.length(); ++a)
+            markup += QString(" %1=\"%2\"").arg(attr.qName(a)).arg(attr.value(a));
+        markup += QString(" poxml_line=\"%1\"").arg(locator->lineNumber());
+        markup += QString(" poxml_col=\"%1\"").arg(locator->columnNumber());
+
+        // note: the single-tag lookup uses the name as given, not tname
+        if (isSingleTag(qName))
+            markup += "/>";
+        else
+            markup += ">";
+        message += markup;
+
+        // for the tag that opened the message, move the start column back
+        // by the length of what has been serialised so far
+        if (startsMessage)
+            startcol -= message.length();
+    }
+
+    const bool mayCarryAnchor =
+        tname == "anchor" || tname.left(4) == "sect" || tname == "chapter";
+    if (mayCarryAnchor) {
+        if (!attr.value("id").isEmpty())
+            list.pc.addAnchor(attr.value("id"));
+    }
+
+    return true;
+}
+
+// Writes the CDATA opening marker into the message being collected, if
+// any. Always returns true.
+bool StructureParser::startCDATA()
+{
+    if (!inside)
+        return true;
+
+    message += "<![CDATA[";
+    return true;
+}
+
+// Writes the CDATA closing marker into the message being collected, if
+// any. Always returns true.
+bool StructureParser::endCDATA()
+{
+    if (!inside)
+        return true;
+
+    message += "]]>";
+    return true;
+}
+
+// Returns what closureTag() reports for the tag that `message` starts
+// with.
+//
+// `message` must begin with '<' (asserted). The tag name is the text after
+// the '<' up to the first whitespace character or '>'.
+//
+// The scan is bounded by the message length, as the equivalent scan in
+// descape() is. Without the bound, a message with no whitespace or '>'
+// after the '<' is read past its end; with Qt 3's QString::at() that yields
+// a null character instead of failing, so the loop would never stop.
+bool StructureParser::isClosure(const QString &message)
+{
+    assert(message.at(0) == '<');
+
+    const uint length = message.length();
+    uint endindex = 1;
+    while (endindex < length &&
+           !message.at(endindex).isSpace() && message.at(endindex) != '>')
+        endindex++;
+
+    QString tag = message.mid(1, endindex - 1);
+    return closureTag(message, tag);
+}
+
+// Walks over the opening and closing occurrences of `tag` in `message`,
+// keeping a nesting depth, and returns true only if the walk reaches the
+// end of the message exactly when the depth returns to zero.
+//
+// If no further closing tag exists the result is true only when the depth
+// is zero and the walk is already at (or past) the end of the message.
+bool StructureParser::closureTag(const QString& message, const QString &tag)
+{
+#ifdef POXML_DEBUG
+    qDebug("closureTag %s %s", message.latin1(), tag.latin1());
+#endif
+
+    int depth = 0;
+    uint pos = 0;
+    for (;;) {
+        const int closeAt = message.find(QRegExp(QString::fromLatin1("</%1[\\s>]").arg(tag)), pos);
+        int openAt = message.find(QRegExp(QString::fromLatin1("<%1[>\\s]").arg(tag)), pos);
+
+        if (closeAt == -1) {
+#ifdef POXML_DEBUG
+            qDebug("ending on no close anymore %d %d %d %d", (!depth && pos >= message.length()), depth, pos, message.length());
+#endif
+            return !depth && pos >= message.length();
+        }
+
+        // no further opening tag: place it behind everything else
+        if (openAt == -1)
+            openAt = message.length() + 1;
+
+        const bool opensFirst = openAt < closeAt;
+
+        // continue right behind the '>' that ends the tag found first
+        pos = (opensFirst ? openAt : closeAt) + 1;
+        while (message.at(pos) != '>')
+            pos++;
+        pos++;
+
+        if (opensFirst) {
+            depth++;
+            continue;
+        }
+
+        depth--;
+        if (!depth) {
+#ifdef POXML_DEBUG
+            qDebug("ending on exit %d", pos >= message.length());
+#endif
+            return pos >= message.length();
+        }
+    }
+}
+
+// Normalises whitespace in `message` in place.
+//
+// After stripWhiteSpace(), newlines, tabs and carriage returns outside
+// literal tags become spaces, and every whitespace character that directly
+// follows another one (outside literal tags) is marked with '\010' and
+// removed at the end. Tags themselves do not reset the "previous character
+// was whitespace" state; any other character does. Nesting of literal tags
+// is tracked from the tag names met along the way.
+void StructureParser::descape(QString &message)
+{
+    stripWhiteSpace( message );
+
+    int literalDepth = 0;
+    bool afterSpace = false;
+
+    for (uint pos = 0; pos < message.length(); pos++) {
+        const char c = message.at(pos).latin1();
+
+        if (c == '\n' || c == '\t' || c == '\r' || c == ' ') {
+            if (!literalDepth) {
+                if (c != ' ')
+                    message[pos] = ' ';
+                if (afterSpace)
+                    message[pos] = '\010';
+            }
+            afterSpace = true;
+        } else if (c == '<') {
+            // read the tag name up to whitespace, '>' or the end
+            uint nameEnd = pos + 1;
+            while (nameEnd < message.length() && !message.at(nameEnd).isSpace() &&
+                   message.at(nameEnd) != '>')
+                nameEnd++;
+            QString tag = message.mid(pos + 1, nameEnd - pos - 1);
+
+            if (tag.at(0) == '/') {
+                if (isLiteralTag(tag.mid(1)))
+                    literalDepth--;
+            } else if (isLiteralTag(tag)) {
+                literalDepth++;
+            }
+        } else {
+            afterSpace = false;
+        }
+    }
+
+    message.replace(QRegExp("\010"), "");
+}
+
+// Trims msg.msgid down to its translatable core, in place.
+//
+// Steps, in order:
+//  - strip surrounding whitespace;
+//  - remove single tags (see singletags) from the start of the message;
+//  - remove self-closed tags ("... />") from the end of the message;
+//  - repeatedly peel off an outermost start/end tag pair when
+//    closureTag() confirms the pair encloses the message, storing the tag
+//    name in msg.tag and taking start/end positions for msg.lines.first()
+//    from the poxml_line/poxml_col attributes of the peeled tags.
+// The number of characters removed from the front is accumulated in
+// `offset` and added to msg.lines.first().offset. If anything changed the
+// function calls itself again, unless a condition="do-not-split"
+// attribute was met (msg.do_not_split).
+//
+// Returns true when msgid is empty on entry or when msg.do_not_split is
+// set; false otherwise. The result of the recursive call is not used.
+bool StructureParser::formatMessage(MsgBlock &msg) const
+{
+#ifdef POXML_DEBUG
+ qDebug("formatMessage %s", msg.msgid.latin1());
+#endif
+
+ int offset = 0;
+ bool changed = false;
+ bool recurse = true;
+
+ if (msg.msgid.isEmpty())
+ return true;
+
+ // count the leading blanks before they are stripped
+ for (int index = 0; msg.msgid.at(index) == ' '; index++, offset++);
+ stripWhiteSpace( msg.msgid );
+
+ // removing starting single tags
+ for (int index = 0; singletags[index]; index++)
+ {
+ int slen = strlen(singletags[index]);
+
+ // the tag name must be followed by a non-alphanumeric character so
+ // that a longer tag name sharing the prefix does not match
+ if (msg.msgid.left(slen + 1) == QString::fromLatin1("<%1").arg(singletags[index]) &&
+ !msg.msgid.at( slen + 1 ).isLetterOrNumber() )
+ {
+#ifdef POXML_DEBUG
+ qDebug("removing single tag %s", singletags[index]);
+#endif
+ int strindex = strlen(singletags[index]) + 1;
+ while (msg.msgid.at(strindex) != '>')
+ strindex++;
+ msg.msgid = msg.msgid.mid(strindex + 1);
+ changed = true;
+ offset += strindex + 1;
+ for (int index = 0; msg.msgid.at(index) == ' '; index++, offset++) ;
+ stripWhiteSpace( msg.msgid );
+ }
+ }
+
+ // removing trailing self-closed tags: cut at the '<' that opens them
+ while (msg.msgid.right(2) == "/>")
+ {
+ int strindex = msg.msgid.length() - 2;
+ while (msg.msgid.at(strindex) != '<')
+ strindex--;
+ msg.msgid = msg.msgid.left(strindex);
+ stripWhiteSpace( msg.msgid ); // only removed space at the end
+ changed = true;
+ }
+
+ for (int index = 0; msg.msgid.at(index) == ' '; index++, offset++) ;
+ stripWhiteSpace( msg.msgid );
+
+ // peel enclosing start/end tag pairs
+ while (true) {
+ if (msg.msgid.at(0) != '<')
+ break;
+ if (msg.msgid.at(msg.msgid.length() - 1) != '>')
+ break;
+ int strindex = 1;
+ while (msg.msgid.at(strindex) != ' ' && msg.msgid.at(strindex) != '>')
+ strindex++;
+ QString starttag = msg.msgid.mid(1, strindex - 1);
+ int endindex = msg.msgid.length() - 2;
+ // NOTE(review): this backward scan stops at any '<' OR at any
+ // character followed by '/'; splitMessage() uses "||" in the
+ // equivalent scan for the last "</". Confirm whether "&&" is intended.
+ while (msg.msgid.at(endindex) != '<' && msg.msgid.at(endindex + 1) != '/')
+ endindex--;
+#ifdef POXML_DEBUG
+ qDebug("endIndex %d", endindex);
+#endif
+ strindex = endindex;
+ QString orig = msg.msgid;
+
+ // text between "</" and the final '>': tag name plus attributes
+ QString endtag = msg.msgid.mid(endindex + 2, msg.msgid.length() - (endindex + 2) - 1);
+ QString endtag_attr = endtag.mid(endtag.find(' '), endtag.length());
+ endtag.replace(infos_reg, "");
+ if (endtag == starttag) {
+ if (!closureTag(msg.msgid, starttag))
+ break;
+
+ // removing start/end tags
+ msg.msgid = msg.msgid.left(endindex);
+ strindex = 0;
+ while (msg.msgid.at(strindex) != '>')
+ strindex++;
+ QString attr = msg.msgid.left(strindex);
+ msg.msgid = msg.msgid.mid(strindex + 1);
+ offset += strindex + 1;
+ for (int index = 0; msg.msgid.at(index) == ' '; index++, offset++) ;
+ stripWhiteSpace( msg.msgid );
+ msg.tag = starttag;
+
+ // positions recorded in the peeled tags replace the ones held so
+ // far; the offset collected up to here is discarded
+ if (infos_reg.search(attr) >= 0) {
+ msg.lines.first().start_line = infos_reg.cap(1).toInt();
+ msg.lines.first().start_col = infos_reg.cap(2).toInt();
+#ifdef POXML_DEBUG
+ qDebug("col %s %s %d", attr.latin1(), msg.msgid.latin1(), msg.lines.first().start_col);
+#endif
+ offset = 0;
+
+ if (infos_reg.search(endtag_attr) >= 0) {
+ msg.lines.first().end_line = infos_reg.cap(1).toInt();
+ msg.lines.first().end_col = infos_reg.cap(2).toInt() + 1;
+ }
+ }
+ if (do_not_split_reg.search(attr) >= 0) {
+ msg.do_not_split = true;
+ break;
+ }
+
+ changed = true;
+ } else
+ break;
+ }
+
+#ifdef POXML_DEBUG
+ qDebug("formatMessage result %s %d %d", msg.msgid.latin1(), changed && recurse, msg.lines.first().start_col);
+#endif
+
+ msg.lines.first().offset += offset;
+ if (msg.do_not_split)
+ recurse = false;
+
+ if (changed && recurse)
+ formatMessage(msg);
+
+ return !recurse; // indicates an abort
+}
+
+// Splits the message block `mb` at cutting tags and returns the resulting
+// blocks.
+//
+// Two cases are tried in order:
+//  1. The message starts with a tag. If closureTag() reports that this tag
+//     encloses the message, `mb` is returned unsplit. Otherwise, if it is a
+//     cutting tag, the message is cut right behind the matching end tag.
+//  2. The message ends with '>' and the last "</" belongs to a cutting
+//     tag: the message is cut in front of the matching start tag.
+// Both halves are passed through formatMessage() and, except when case 1
+// finds that both calls returned true, split again recursively. If neither
+// case applies the result holds just `mb`.
+MsgList StructureParser::splitMessage(const MsgBlock &mb)
+{
+ MsgList result;
+
+ MsgBlock msg1 = mb;
+ MsgBlock msg2 = mb;
+
+ QString message = mb.msgid;
+
+#ifdef POXML_DEBUG
+ qDebug("splitMessage %s", message.latin1());
+#endif
+
+ if (message.at(0) == '<') {
+ // tag name: text after '<' up to whitespace or '>'
+ int endindex = 1;
+ while (!message.at(endindex).isSpace() && message.at(endindex) != '>')
+ endindex++;
+ QString tag = message.mid(1, endindex - 1);
+
+ if (closureTag(message, tag))
+ goto error;
+
+ if (isCuttingTag(tag))
+ {
+ // if the message starts with a cutting tag, this tag has to
+ // end in between. We split both messages and format them
+ int strindex = endindex;
+ strindex++;
+
+ // nesting depth of `tag`; the scan ends when it drops to zero
+ int inside = 1;
+ while (true) {
+#ifdef POXML_DEBUG
+ qDebug("inside %s %d", message.mid(strindex, 35).latin1(), inside);
+#endif
+
+ // the exception for poxml_* attributes is made in the closing tag
+ int closing_index = message.find(QRegExp(QString::fromLatin1("</%1[\\s>]").arg(tag)),
+ strindex);
+ int starting_index = message.find(QRegExp(QString::fromLatin1("<%1[\\s>]").arg(tag)),
+ strindex);
+
+#ifdef POXML_DEBUG
+ qDebug("index1 %d %d %d", closing_index, starting_index, strindex);
+#endif
+
+ // when a new start was found, we set the start_index after the next match
+ // (and set strindex to it later - increasing inside)
+ if (starting_index != -1) {
+ starting_index += tag.length() + 1;
+ while (message.at(starting_index) != '>')
+ starting_index++;
+ starting_index++;
+ }
+
+#ifdef POXML_DEBUG
+ qDebug("index %d %d %d", closing_index, starting_index, strindex);
+#endif
+
+ // move closing_index right behind the '>' of the closing tag
+ assert(closing_index != -1);
+ closing_index += 3 + tag.length();
+ while (message.at(closing_index - 1) != '>')
+ closing_index++;
+
+ if (starting_index == -1) {
+ strindex = closing_index;
+#ifdef POXML_DEBUG
+ qDebug("set strindex %d", strindex);
+#endif
+ inside--;
+ if (!inside)
+ break;
+ continue;
+ }
+ if (closing_index < starting_index)
+ {
+ strindex = closing_index;
+ inside--;
+ } else {
+ strindex = starting_index;
+ inside++;
+ }
+
+ if (!inside)
+ break;
+ }
+
+#ifdef POXML_DEBUG
+ qDebug("split into %s -AAAAAANNNNNNDDDDDD- %s", message.left(strindex).latin1(), message.mid(strindex).latin1());
+#endif
+ msg1.msgid = message.left(strindex);
+ bool leave = formatMessage(msg1);
+
+ msg2.msgid = message.mid(strindex);
+ msg2.lines.first().offset += strindex;
+ // bitwise '&': formatMessage(msg2) runs whatever `leave` holds
+ leave = leave & formatMessage(msg2);
+
+ // the second part must not start before the first one ends
+ if (msg1.lines.first().end_line > msg2.lines.first().start_line ||
+ (msg1.lines.first().end_line == msg2.lines.first().start_line &&
+ msg1.lines.first().end_col > msg2.lines.first().start_col))
+ {
+ msg2.lines.first().start_line = msg1.lines.first().end_line;
+ msg2.lines.first().start_col = msg1.lines.first().end_col;
+ }
+
+#ifdef POXML_DEBUG
+ qDebug("splited %d-%d(%s) and %d-%d(%s)", msg1.lines.first().end_line,msg1.lines.first().end_col,
+ msg1.msgid.latin1(),
+ msg2.lines.first().start_line,msg2.lines.first().start_col, msg2.msgid.latin1());
+#endif
+
+ // both formatMessage() calls returned true: keep the two parts as
+ // they are instead of splitting them further
+ if (leave) {
+ result.append(msg1);
+ result.append(msg2);
+ return result;
+ }
+ result = splitMessage(msg1);
+ result += splitMessage(msg2);
+ return result;
+ }
+
+ }
+
+ if (message.at(message.length() - 1 ) == '>')
+ {
+ // find the last "</" and take the tag name that follows it
+ int endindex = message.length() - 1;
+ while (endindex >= 0 && (message.at(endindex) != '<' || message.at(endindex + 1) != '/'))
+ endindex--;
+ QString tag = message.mid(endindex + 2, message.length() - endindex - 3);
+ if (tag.find(' ') > 0 ) {
+ tag = tag.left(tag.find(' '));
+ }
+#ifdef POXML_DEBUG
+ qDebug("behind tag %s", tag.latin1());
+#endif
+
+ if (isCuttingTag(tag))
+ {
+ // if the message ends with a cutting tag, this tag has to
+ // start in between. We split both messages and format them
+ int strindex = endindex;
+
+ // backward scan: closing tags increase the depth, start tags
+ // decrease it
+ int inside = 1;
+ while (true) {
+#ifdef POXML_DEBUG
+ qDebug("inside %s %d", message.mid(strindex, 35).latin1(), inside);
+#endif
+
+ int closing_index = message.findRev(QRegExp(QString::fromLatin1("</%1[\\s>]").arg(tag)),
+ strindex - 1);
+ int starting_index = message.findRev(QRegExp(QString::fromLatin1("<%1[\\s>]").arg(tag)),
+ strindex - 1);
+
+#ifdef POXML_DEBUG
+ qDebug("index1 %d %d %d", closing_index, starting_index, strindex);
+#endif
+
+ if (starting_index == -1) {
+ assert(inside == 1);
+ break;
+ }
+
+ if (closing_index > starting_index)
+ {
+ strindex = closing_index;
+ inside++;
+ } else {
+ strindex = starting_index;
+ inside--;
+ }
+
+ if (!inside)
+ break;
+ }
+
+
+#ifdef POXML_DEBUG
+ qDebug("split2 into \"%s\" -AAAAAANNNNNNNNNDDDDDDDDDDD- \"%s\"", message.left(strindex).latin1(), message.mid(strindex).latin1());
+#endif
+
+ msg1.msgid = message.left(strindex);
+ formatMessage(msg1);
+
+ msg2.msgid = message.mid(strindex);
+ msg2.lines.first().offset += strindex;
+ formatMessage(msg2);
+
+ // the first part must not end behind the start of the second one
+ if (msg1.lines.first().end_line > msg2.lines.first().start_line ||
+ (msg1.lines.first().end_line == msg2.lines.first().start_line &&
+ msg1.lines.first().end_col > msg2.lines.first().start_col))
+ {
+ msg1.lines.first().end_line = msg2.lines.first().start_line;
+ msg1.lines.first().end_col = msg2.lines.first().start_col - 1;
+ }
+
+#ifdef POXML_DEBUG
+ qDebug("splited %d-%d(%s) and %d-%d(%s)", msg1.lines.first().end_line,msg1.lines.first().end_col,
+ msg1.msgid.latin1(),
+ msg2.lines.first().start_line,msg2.lines.first().start_col, msg2.msgid.latin1());
+#endif
+
+ result = splitMessage(msg1);
+ result += splitMessage(msg2);
+
+ return result;
+ }
+ }
+ // nothing to split (also the jump target for an enclosing first tag)
+error:
+ result.append(mb);
+ return result;
+}
+
+// Handles a closing tag.
+//
+// While a message is being collected, a closing tag carrying
+// poxml_line/poxml_col attributes is written into it (nothing is written
+// for single tags). When the closing tag is a cutting tag and it ends the
+// outermost one, the collected text is cleaned with descape(), wrapped in
+// a MsgBlock with the recorded start and current end position, run through
+// formatMessage() and splitMessage(), and every non-empty piece is
+// appended to `list` after its poxml_* attributes have been removed.
+// Always returns true.
+bool StructureParser::endElement( const QString& , const QString&, const QString& qName)
+{
+    QString tname = qName.lower();
+
+    if (inside && !isSingleTag(qName)) {
+        message += QString("</%1").arg(tname);
+        message += QString(" poxml_line=\"%1\"").arg(locator->lineNumber());
+        message += QString(" poxml_col=\"%1\"").arg(locator->columnNumber());
+        message += ">";
+    }
+
+    if (!isCuttingTag(tname))
+        return true;
+
+    inside--;
+    if (inside)
+        return true;
+
+    // the outermost cutting tag was closed: emit the collected message
+    MsgBlock m;
+    descape(message);
+    m.msgid = message;
+
+    BlockInfo bi;
+    bi.start_line = startline;
+    bi.start_col = startcol;
+    bi.end_line = locator->lineNumber();
+    bi.end_col = locator->columnNumber() + 1;
+    bi.offset = m.lines.first().offset;
+    m.lines.append(bi);
+    formatMessage(m);
+
+    MsgList messages = splitMessage(m);
+    MsgList::Iterator it = messages.begin();
+    while (it != messages.end()) {
+#ifdef POXML_DEBUG
+        qDebug("parser '%s' %d '%s' %d:%d", (*it).msgid.latin1(), (*it).lines.first().offset, message.mid((*it).lines.first().offset, 15).latin1(), (*it).lines.first().start_line, (*it).lines.first().start_col);
+#endif
+        // if the remaining text still starts with a tag, the poxml_ info
+        // is most probably more correct
+        const bool startsWithEnclosingTag =
+            (*it).msgid.at(0) == '<' && isClosure((*it).msgid);
+        if (startsWithEnclosingTag && infos_reg.search((*it).msgid) >= 0) {
+            (*it).lines.first().start_line = infos_reg.cap(1).toInt();
+            (*it).lines.first().start_col = infos_reg.cap(2).toInt();
+            (*it).lines.first().offset = 0;
+        }
+        (*it).msgid.replace(infos_reg, QString::null);
+
+        if (!(*it).msgid.isEmpty())
+            list.append(*it);
+
+        ++it;
+    }
+
+    return true;
+}
+
+// Comments starting with " TRANS:" are not expected to reach the handler
+// (cleanupTags() rewrites them into <trans_comment> elements), so meeting
+// one trips an assertion. Always returns true.
+bool StructureParser::comment ( const QString &c )
+{
+    const bool isTransComment = (c.left(7) == " TRANS:");
+    if (isTransComment)
+        assert(false);
+
+    return true;
+}
+
+// Returns a copy of _contents in which newlines, '<', '>' and blanks are
+// replaced by &POXML_*; placeholders. Tabs are turned into a blank first
+// and therefore end up as the blank placeholder as well.
+QString StructureParser::escapeLiterals( const QString &_contents) {
+    QString escaped( _contents );
+
+    escaped.replace(QRegExp("\n"), "&POXML_LINEFEED;");
+    escaped.replace(QRegExp("<"), "&POXML_LT;");
+    escaped.replace(QRegExp(">"), "&POXML_GT;");
+
+    // tabs first, so the blank they become is escaped by the next step
+    escaped.replace(QRegExp("\t"), " ");
+    escaped.replace(QRegExp(" "), "&POXML_SPACE;");
+
+    return escaped;
+}
+
+// Returns a copy of _contents with the &POXML_*; placeholders written by
+// escapeLiterals() and the !POXML_AMP! marker written by cleanupTags()
+// turned back into the characters they stand for.
+QString StructureParser::descapeLiterals( const QString &_contents) {
+    QString plain( _contents );
+
+    plain.replace(QRegExp("&POXML_LINEFEED;"), "\n");
+    plain.replace(QRegExp("&POXML_LT;"), "<");
+    plain.replace(QRegExp("&POXML_GT;"), ">");
+    plain.replace(QRegExp("&POXML_SPACE;"), " ");
+    plain.replace(QRegExp("!POXML_AMP!"), "&");
+
+    return plain;
+}
+
+// Strips whitespace from both ends of `contents` in place, then keeps
+// removing &POXML_LINEFEED; and &POXML_SPACE; placeholders from the start
+// and the end until neither end carries one.
+void StructureParser::stripWhiteSpace( QString &contents)
+{
+    static const char lineFeed[] = "&POXML_LINEFEED;";
+    static const char blank[] = "&POXML_SPACE;";
+
+    contents = contents.stripWhiteSpace();
+
+    bool again = true;
+    while (again) {
+        again = false;
+
+        if (contents.startsWith(lineFeed)) {
+            contents = contents.mid(strlen(lineFeed), contents.length());
+            again = true;
+        }
+        if (contents.startsWith(blank)) {
+            contents = contents.mid(strlen(blank), contents.length());
+            again = true;
+        }
+        if (contents.endsWith(lineFeed)) {
+            contents = contents.left(contents.length() - strlen(lineFeed));
+            again = true;
+        }
+        if (contents.endsWith(blank)) {
+            contents = contents.left(contents.length() - strlen(blank));
+            again = true;
+        }
+    }
+}
+
+// Rewrites the raw document text in `contents` (in place) before it is
+// handed to the XML reader. Passes, in order:
+//  1. every '&' becomes "!POXML_AMP!" (descapeLiterals() turns it back);
+//  2. the text inside each literal tag is run through escapeLiterals();
+//  3. closing tags with whitespace before '>' ("</tag  >") are tightened;
+//  4. whitespace in front of '>' or '/>' in other tags is removed;
+//  5. self-closed tags that are not single tags ("<tag attrs/>") are
+//     expanded to "<tag attrs></tag>";
+//  6. "<!-- TRANS:...-->" comments become <trans_comment> elements.
+void StructureParser::cleanupTags( QString &contents )
+{
+ contents.replace(QRegExp("&"), "!POXML_AMP!");
+
+ for (int index = 0; literaltags[index]; index++) {
+ QRegExp start(QString("<%1[\\s>]").arg(literaltags[index]));
+ QRegExp end(QString("</%1[\\s>]").arg(literaltags[index]));
+ int strindex = 0;
+ while (true) {
+ strindex = contents.find(start, strindex);
+ if (strindex < 0)
+ break;
+ // move behind the '>' that ends the start tag
+ while (contents.at(strindex) != '>')
+ strindex++;
+ strindex++; // one more
+ // NOTE(review): endindex is -1 when the literal tag is never
+ // closed, which makes the length passed to mid() negative --
+ // confirm how QString::mid() treats that.
+ int endindex = contents.find(end, strindex);
+ QString part = contents.mid(strindex, endindex - strindex);
+ QString newpart = escapeLiterals(part);
+ contents.replace(strindex, part.length(), newpart);
+ // this assumes that literal tags to not overlap
+ strindex = strindex + newpart.length();
+ }
+ }
+
+ // "</tag   >" -> "</tag>"
+ QRegExp unclosed("</(\\w*)\\s\\s*>");
+ int index = -1;
+ while (true) {
+ index = unclosed.search(contents, index + 1);
+ if (index < 0)
+ break;
+ QString tag = unclosed.cap(1);
+ contents.replace(index, unclosed.matchedLength(), QString("</%1>").arg(tag));
+ }
+
+ // drop whitespace in front of '>' / '/>'; the last capture holds the
+ // slashes (if any) found in front of the '>'
+ QRegExp start("<((\\s*[^<>\\s])*)\\s\\s*(/*)>");
+ start.setMinimal(true);
+
+ index = -1;
+ while (true) {
+ index = start.search(contents, index + 1);
+ if (index < 0)
+ break;
+ QString tag = start.cap(1);
+ QString cut = start.capturedTexts().last();
+ // qDebug("UNCLO %s %d -%s- -%s-", start.cap(0).latin1(), index, tag.latin1(), cut.latin1());
+ contents.replace(index, start.matchedLength(), QString("<%1%2>").arg(tag).arg(cut));
+ }
+ // expand "<tag attrs/>" for tags that are not in singletags
+ QRegExp singletag("<(\\w*)\\s([^><]*)/>");
+
+ index = -1;
+ while (true) {
+ index = singletag.search(contents, index + 1);
+ if (index < 0)
+ break;
+ QString tag = singletag.cap(1);
+ if (!StructureParser::isSingleTag(tag)) {
+ contents.replace(index, singletag.matchedLength(), QString("<%1 %2></%3>").arg(tag).arg(singletag.cap(2)).arg(tag));
+ }
+ }
+
+ // translator comments become elements (trans_comment is a cutting tag)
+ QRegExp trans_comment("<!-- TRANS:([^<>]*)-->");
+ index = -1;
+ while (true) {
+ index = trans_comment.search(contents, index + 1);
+ if (index < 0)
+ break;
+ QString msgid = trans_comment.cap(1);
+ contents.replace(index, trans_comment.matchedLength(), QString("<trans_comment>%1</trans_comment>").arg(msgid));
+ }
+
+#ifdef POXML_DEBUG
+ qDebug("final %s", contents.latin1());
+#endif
+
+}
+
+// Replaces the first "<tag ...>whitespace</tag>" occurrence in `contents`
+// with a single blank and logs it via qDebug(). Returns true if such an
+// occurrence was found and replaced, false otherwise.
+static bool removeEmptyTag( QString &contents, const QString & tag)
+{
+    QRegExp empty(QString("<%1[^>]*>[\\s\n][\\s\n]*</%2\\s*>").arg(tag).arg(tag));
+
+    const int hit = contents.find(empty, 0);
+    if (hit < 0)
+        return false;
+
+    qDebug("found empty tag %s", tag.latin1());
+    contents.replace(hit, empty.matchedLength(), " ");
+    return true;
+}
+
+// Removes whitespace-only elements from `contents`, one per round: first
+// the first cutting tag for which removeEmptyTag() succeeds, then
+// "glossterm". Rounds repeat until one removes nothing.
+void StructureParser::removeEmptyTags( QString &contents )
+{
+    for (;;) {
+        bool removedSomething = false;
+
+        for (const char **tag = cuttingtags; *tag; ++tag) {
+            if (removeEmptyTag(contents, *tag)) {
+                removedSomething = true;
+                break;
+            }
+        }
+
+        // as glossterm has two different semantics, it's likely
+        // to break something when it's cuttingtag
+        if (removeEmptyTag(contents, "glossterm"))
+            removedSomething = true;
+
+        if (!removedSomething)
+            return;
+    }
+}
+
+// Appends character data to the message being collected, if any. Always
+// returns true.
+bool StructureParser::characters(const QString &ch)
+{
+    if (!inside || ch.isEmpty())
+        return true;
+
+    message += ch;
+    return true;
+}
+
+// Returns `message` with backslashes and double quotes escaped with a
+// backslash. Backslashes are handled first so that the ones added for the
+// quotes are not doubled.
+QString escape(QString message)
+{
+    const QRegExp backslash("\\\\");
+    const QRegExp quote("\"");
+
+    message.replace(backslash, "\\\\");
+    message.replace(quote, "\\\"");
+    return message;
+}
+
+// Writes `message` to stdout as a PO entry introduced by `prefix`.
+//
+// A message without line breaks goes on the prefix line as one quoted,
+// escaped string. Otherwise the prefix line carries an empty string and
+// each line of the message follows as its own quoted string, with "\n"
+// appended to all but the last one.
+void outputMsg(const char *prefix, const QString &message)
+{
+    QStringList lines = QStringList::split('\n', message, true);
+
+    if (lines.count() == 1) {
+        QString only = lines.first();
+        cout << prefix << " \"";
+        if (!only.isEmpty())
+            cout << escape(only).utf8().data();
+        cout << "\"\n";
+        return;
+    }
+
+    cout << prefix << " \"\"\n";
+    for (QStringList::ConstIterator it = lines.begin(); it != lines.end(); it++) {
+        QString current = *it;
+        const bool isLast = (it == lines.fromLast());
+
+        cout << " \"";
+        if (!current.isEmpty())
+            cout << escape(current).utf8().data();
+        if (!isLast)
+            cout << "\\n";
+        cout << "\"\n";
+    }
+}
+
+// Turns the escape sequences of a PO string back into the characters they
+// stand for and returns the result: backslash-n becomes a line feed,
+// backslash-quote a double quote, backslash-t a tab and a double
+// backslash a single one.
+//
+// A sequence that is itself preceded by a backslash is handled
+// separately: for "n" and the quote it is parked in a &POXML_LITERAL...;
+// placeholder that is resolved after the double backslashes have been
+// collapsed.
+QString escapePO(QString msgid)
+{
+ int index = 0;
+ while (true) {
+ index = msgid.find("\\n", index);
+ if (index == -1)
+ break;
+ // NOTE(review): for index == 1 this reads at(index - 2), i.e. in
+ // front of the string; the quote loop below guards with "index > 1"
+ // instead. Confirm which guard is intended.
+ if (index >= 1 && msgid.at(index - 1) == '\\' && msgid.at(index - 2) != '\\') {
+ msgid.replace(index - 1, 3, "&POXML_LITERALLINEFEED;");
+ index += 3;
+ } else
+ msgid.replace(index, 2, "\n");
+ }
+ index = 0;
+ while (true) {
+ index = msgid.find("\\\"", index);
+ if (index == -1)
+ break;
+ if (index > 1 && msgid.at(index - 1) == '\\' && msgid.at(index - 2) != '\\')
+ msgid.replace(index - 1, 3, "&POXML_LITERALQUOTE;");
+ else
+ msgid.replace(index, 2, "\"");
+ }
+ index = 0;
+ while (true) {
+ index = msgid.find("\\t", index);
+ if (index == -1)
+ break;
+ // NOTE(review): no "index > 0" guard here; for index == 0 this
+ // reads at(-1). Confirm that this is harmless.
+ if (msgid.at(index - 1) == '\\')
+ msgid.replace(index - 1, 3, "\\t");
+ else
+ msgid.replace(index, 2, "\t");
+ }
+ // collapse the remaining double backslashes, stepping over each result
+ index = 0;
+ while (true) {
+ index = msgid.find("\\\\", index);
+ if (index == -1)
+ break;
+ msgid.replace(index, 2, "\\");
+ index += 1;
+ }
+
+ msgid.replace(QRegExp("&POXML_LITERALLINEFEED;"), "\\n");
+ // NOTE(review): the quote placeholder stood for backslash, backslash,
+ // quote but is replaced by a lone backslash, so the quote is dropped
+ // -- confirm whether backslash + quote was intended.
+ msgid.replace(QRegExp("&POXML_LITERALQUOTE;"), "\\");
+ return msgid;
+}
+
+
+// Reads the XML file `filename` (as UTF-8), extracts its messages with a
+// StructureParser and returns them.
+//
+// Before parsing, the text is run through StructureParser::cleanupTags()
+// and every "<!ENTITY ...>" declaration is cut out; the line feeds it
+// contained are kept so that line numbers stay the same. After parsing,
+// message ids that are shorter than four characters, or that occur with
+// different tags, are wrapped in their tag (<tag>msgid</tag>) until a pass
+// finds nothing left to wrap.
+//
+// If the file cannot be opened, an error is printed to stderr and the
+// handler's (untouched) list is returned.
+MsgList parseXML(const char *filename)
+{
+    StructureParser handler;
+    QFile xmlFile( filename );
+    if (!xmlFile.open(IO_ReadOnly)) {
+        cerr << "parseXML: cannot open " << (filename ? filename : "(null)") << endl;
+        return handler.getList();
+    }
+
+    // Size the buffer from what was actually read, not from what the file
+    // system reports, so that the copy can never run past the data.
+    const QByteArray raw = xmlFile.readAll();
+    xmlFile.close();
+
+    QCString ccontents;
+    ccontents.fill(0, raw.size() + 1);
+    if (raw.size() > 0)
+        memcpy(ccontents.data(), raw.data(), raw.size());
+
+    QString contents = QString::fromUtf8( ccontents );
+    StructureParser::cleanupTags(contents);
+
+    // cut out the <!ENTITY ...> declarations, keeping their line feeds
+    while (true) {
+        int index = contents.find("<!ENTITY");
+        if (index < 0)
+            break;
+
+        const int total = contents.length();
+        int inside = 0;
+        int endindex = index + 1;
+        QString replacement = "";
+        // Stop at the end of the text as well: an unterminated declaration
+        // would otherwise keep this loop running forever.
+        while (endindex < total && (contents.at(endindex) != '>' || inside))
+        {
+            switch (contents.at(endindex).latin1()) {
+            case '<':
+                inside++; break;
+            case '>':
+                inside--; break;
+            case '\n':
+                replacement += '\n';
+                break;
+            default:
+                break;
+            }
+            endindex++;
+        }
+        if (endindex < total)
+            endindex++; // the closing '>' goes as well
+        contents.replace(index, endindex - index, replacement);
+    }
+
+    QTextStream ts(contents.utf8(), IO_ReadOnly);
+    QXmlInputSource source( ts );
+    QXmlSimpleReader reader;
+    reader.setFeature( "http://trolltech.com/xml/features/report-start-end-entity", true);
+    reader.setContentHandler( &handler );
+    reader.setLexicalHandler( &handler );
+    reader.setDTDHandler( &handler );
+    // reader.setErrorHandler( &handler );
+    reader.parse( source );
+    MsgList english = handler.getList();
+
+    bool changed = false;
+
+    // each pass wraps at most one group of message ids and then starts
+    // over, because the wrapped ids have to be checked again
+    do {
+        changed = false;
+        QMap<QString, QString> msgids;
+
+        for (MsgList::Iterator it = english.begin();
+             it != english.end(); it++)
+        {
+            QMap<QString,QString>::Iterator found = msgids.find((*it).msgid);
+            if ((*it).msgid.length() < 4) {
+                (*it).msgid = QString("<%1>").arg((*it).tag) + (*it).msgid +
+                              QString("</%1>").arg((*it).tag);
+                changed = true;
+                break;
+            }
+            if (found != msgids.end()) {
+                if (found.data() != (*it).tag) {
+#ifdef POXML_DEBUG
+                    qDebug("same msgid for '%s' and '%s'", found.data().latin1(), (*it).tag.latin1());
+#endif
+                    changed = true;
+                    // the same text under different tags: wrap every
+                    // message with this id in its own tag
+                    QString msgid = (*it).msgid;
+                    for (MsgList::Iterator it2 = english.begin();
+                         it2 != english.end(); it2++)
+                    {
+                        if ((*it2).msgid == msgid)
+                            (*it2).msgid = QString("<%1>").arg((*it2).tag) + msgid + QString("</%1>").arg((*it2).tag);
+                    }
+                    break;
+                }
+            } else {
+                msgids.insert((*it).msgid, (*it).tag);
+            }
+        }
+    } while (changed);
+
+    return english;
+}
+