#!/usr/bin/env python2
r"""
db2vim [options] file.xml

SHORT OPTIONS

-d  Prints some debugging information on stderr.

-s  If given, db2vim operates in a 'strict' conversion mode, i.e., any
    element which does not have a handler defined for it will be
    completely ignored, including all its children. Otherwise, db2vim will
    recurse into an unknown tag and process any of its children it
    recognizes. Since db2vim always recognizes text nodes, not using this
    option has the effect that all text will be printed out, even if
    somewhat incorrectly.

LONG OPTIONS

--prefix=<prefix>
    This is a string like "ls_" which will be prepended to the section
    numbers. Defaults to 'ls_' if not supplied.
"""


import xml.dom.minidom
import getopt
import string
import re
import sys

# Okay. so I import *. Shoot me.
from textutils import *
from domutils import *

# define a bunch of constants for formatting.
#
# TEXT_WIDTH  = default `width` of handleElement(); also the length of the
#               delimiter line handleSection() prints above a section.
# BLOCK_QUOTE = number of columns handleBlockQuote() indents its text by.
# COL_SPACE   = padding passed to FormatTable() and added to every cell
#               width computed in calculateColumnWidths().
TEXT_WIDTH = 80
BLOCK_QUOTE = 4
COL_SPACE = 2

# a bunch of globals used in creating the Table of contents.
# The three hashes are keyed by the section's 'id' attribute and are filled
# in by makeTocHash().
#
# TOC_HASH['section 1.1 label'] = 'ls_1_1'
#
# LEVEL_HASH['section 1.1 label'] = 1
#       (top level article has level 0)
#
# TITLE_HASH['section 1.1 label'] = 'Title of section 1.1'
#
# FILENAME = the name of the file being processed with the last extension
#            changed to .txt
#
# TOC_PREFIX = 'ls_' (the prefix used to create the section labels; the
#              main script sets it from --prefix, 'ls_' by default).
TOC_HASH = {}
LEVEL_HASH = {}
TITLE_HASH = {}
FILENAME = ''
TOC_PREFIX = ''

# ANCHOR_HASH[id] = generated tag (TOC_PREFIX + 'a_' + letters) for the ids
#                   of sections, <anchor> and <note> elements.
# URL_HASH[url]   = generated tag (TOC_PREFIX + 'u_' + number) for each
#                   distinct <ulink> url.
ANCHOR_HASH = {}
URL_HASH = {}

# DEBUG:  when true, printerr() writes its messages to STDERR.
# STDERR: stream used for warnings and debugging info.
# STRICT: when true, handleElement() skips nodes it has no handler for
#         instead of recursing into them.
# NUM_ANCHORS: NUM_ANCHORS[0] is the running counter from which the anchor
#         tags are generated (kept in a dict, presumably so that functions
#         can update it without a `global` statement).
DEBUG = 0
STDERR = sys.stderr
STRICT = 0
NUM_ANCHORS = {0: 1}

###############################################################################
# Miscellaneous utility functions
###############################################################################


def encodeTo52(num):
    """
    Encode a non-negative integer using the 52 ASCII letters as digits.

    The digits 0-25 are written as 'a'-'z' and 26-51 as 'A'-'Z'; larger
    numbers are written most significant digit first (52 -> 'ba').

    Raises ValueError for a negative number. Such input used to fall into
    the first branch and silently produce a non-letter character.
    """
    if num < 0:
        raise ValueError(
            "encodeTo52() needs a non-negative number, got %r" % (num,))

    # unichr exists only on Python 2. Falling back to chr elsewhere keeps
    # the result a text string on either version.
    try:
        toChar = unichr
    except NameError:
        toChar = chr

    # Collect the digits least significant first, then reverse them.
    # divmod keeps the arithmetic integral regardless of how `/` behaves.
    digits = []
    while True:
        num, digit = divmod(num, 52)
        if digit < 26:
            digits.append(toChar(ord('a') + digit))
        else:
            digits.append(toChar(ord('A') + digit - 26))
        if num == 0:
            break

    return ''.join(reversed(digits))


def makeTocHash(rootElement, width, prefix='', level=0):
    """
    Number the <section> children of rootElement and record them in the
    table-of-contents globals, descending into nested sections.

    For each section this stores its label (TOC_PREFIX + prefix + running
    number) in TOC_HASH, its nesting level in LEVEL_HASH, its rendered
    title in TITLE_HASH and a generated tag in ANCHOR_HASH. A section
    without an 'id' attribute is given its label as id. Nested sections
    are processed with `width` reduced by 5 and the parent's number
    appended to `prefix`.
    """
    sections = rootElement.getChildrenByTagName('section')

    for index, section in enumerate(sections):
        number = str(index + 1)

        heading = section.getChildrenByTagName('title')[0]
        headingText = handleElement(heading, width)
        label = TOC_PREFIX + prefix + number

        ident = section.getAttribute('id')
        if not ident:
            # sections without an id are addressed by their label
            ident = label
            section.setAttribute('id', ident)

        NUM_ANCHORS[0] += 1
        anchorCode = encodeTo52(NUM_ANCHORS[0] + 52)
        ANCHOR_HASH[ident] = TOC_PREFIX + 'a_' + anchorCode

        TOC_HASH[ident] = label
        LEVEL_HASH[ident] = level
        TITLE_HASH[ident] = headingText

        if section.getChildrenByTagName('section'):
            makeTocHash(section, width - 5,
                        prefix=prefix + number + '_',
                        level=level + 1)


def makeAnchorHash(rootElement):
    """
    Give every <anchor> and <note> below rootElement which carries an 'id'
    attribute a generated tag in ANCHOR_HASH.

    A warning is written to stderr when the id is already present in
    ANCHOR_HASH or TOC_HASH; the entry is overwritten regardless.
    """
    tagged = rootElement.getElementsByTagName('anchor') + \
        rootElement.getElementsByTagName('note')

    for node in tagged:
        ident = node.getAttribute('id')
        if ident:
            NUM_ANCHORS[0] += 1
            if ident in ANCHOR_HASH or ident in TOC_HASH:
                sys.stderr.write(
                    "Warning: anchor [%s] multiply defined\n" % ident)

            anchorCode = encodeTo52(NUM_ANCHORS[0] + 52)
            ANCHOR_HASH[ident] = TOC_PREFIX + 'a_' + anchorCode


def makeURLHash(rootElement):
    """
    Assign a numbered tag (TOC_PREFIX + 'u_' + n) in URL_HASH to each url
    used by a <ulink> below rootElement.

    Links without a url and urls already present in URL_HASH are skipped,
    so the numbering counts only the urls added by this call.
    """
    count = 0
    for link in rootElement.getElementsByTagName('ulink'):
        target = link.getAttribute('url')
        if target and target not in URL_HASH:
            count += 1
            URL_HASH[target] = TOC_PREFIX + 'u_' + str(count)


def makeTOC(node, width, maxlevel=1):
    """
    Build the table of contents text for the <section> children of node.

    Each section whose level (from LEVEL_HASH) is at most `maxlevel`
    contributes a line '|label| title'. Sections above `maxlevel` with
    nested sections are followed by the nested table, indented by four
    columns. Sections missing from LEVEL_HASH count as level 10 and are
    therefore left out.

    `width` is only handed down (reduced by 5) to the nested calls; it does
    not influence the text produced here.

    Returns the table as a string ('' if nothing qualifies).
    """
    retText = ""

    for section in node.getChildrenByTagName('section'):

        sectionid = section.getAttribute('id')
        thisLabel = TOC_HASH.get(sectionid, '')
        titleText = TITLE_HASH.get(sectionid, '')
        level = LEVEL_HASH.get(sectionid, 10)

        if level <= maxlevel:
            retText += '|' + thisLabel + '| ' + titleText + '\n'

        if level < maxlevel and section.getChildrenByTagName('section'):
            # Hand maxlevel down: the nested call used to fall back to the
            # default of 1, which cut the table off below the first level
            # whenever a caller asked for a deeper one.
            childText = makeTOC(section, width - 5, maxlevel)
            retText += VertCatString("    ", 4, childText) + '\n'

        # collapse whatever whitespace trails the text into one newline
        retText = re.sub(r'\s+$', r'\n', retText)

    return retText


###############################################################################
# Generalized function for handling dom elements.
###############################################################################


def IsInlineTag(self):
    """
    Tell whether a DOM node is part of running text.

    Returns 1 for text nodes and for elements whose tag name has a true
    value in inlineTags, and 0 for everything else.
    """
    if self.nodeType == self.TEXT_NODE:
        return 1

    # Comments, processing instructions, CDATA sections and the like have
    # no tagName attribute; looking it up would raise AttributeError.
    if self.nodeType != self.ELEMENT_NODE:
        return 0

    if inlineTags.get(self.tagName, 0):
        return 1

    return 0


def getChildrenByTagName(self, name):
    """
        extension to the xml.dom.minidom.Element class.  returns, in
        document order, the direct children of this Element which are
        elements named `name` (deeper descendants are not considered).
    """
    return [node for node in self.childNodes
            if node.nodeType == node.ELEMENT_NODE and node.nodeName == name]


# make the function available as a method of every minidom Element
xml.dom.minidom.Element.getChildrenByTagName = getChildrenByTagName


def handleElement(rootElement, width=TEXT_WIDTH):
    """
    Convert the children of a DOM node to text and return it.

    The children are visited in document order:
      * a run of consecutive inline nodes (see IsInlineTag) is gathered
        into one piece of text, using the handler of an inline element if
        there is one and its plain text otherwise, and passed through
        IndentParagraphs with `width`;
      * any other element with an entry in handlerMaps is rendered by
        that handler;
      * a <?vimhelp ...?> processing instruction is rendered by the
        handler registered under its data, if any;
      * anything else is recursed into, unless STRICT is set, in which
        case it is skipped.
    """
    output = ""
    node = rootElement.firstChild

    while node is not None:
        printerr('node type = %d' % node.nodeType)
        isElement = node.nodeType == node.ELEMENT_NODE
        if isElement:
            printerr('processing [%s]' % node.tagName)

        if IsInlineTag(node):
            # gather the whole run of inline nodes; the inner loop leaves
            # `node` on the first node after the run (or on None).
            run = ""
            while node is not None and IsInlineTag(node):
                if node.nodeType == node.TEXT_NODE:
                    run += node.data
                elif node.nodeType == node.ELEMENT_NODE:
                    handler = handlerMaps.get(node.tagName)
                    if handler is None:
                        run += GetText(node.childNodes)
                    else:
                        run += handler(node, width)
                node = node.nextSibling

            output += IndentParagraphs(run, width)
            continue

        if isElement and node.tagName in handlerMaps:
            printerr('making recursive call to known child.')
            output += handlerMaps[node.tagName](node, width)

        elif node.nodeType == node.PROCESSING_INSTRUCTION_NODE \
                and node.target == 'vimhelp':
            if node.data in handlerMaps:
                output += handlerMaps[node.data](node, width)

        elif not STRICT:
            # nothing is known about this node: look at its children
            # hoping there is something to gather from there.
            printerr('making recursive call for unkown child')
            output += handleElement(node, width)

        node = node.nextSibling

    return output


###############################################################################
# Functions for handling various xml tags
###############################################################################


def handleArticleInfo(articleinfo, width):
    """
    Render the <articleinfo> element: centered title, file tag and author
    list, the abstract (if there is one), the table of contents and a note
    about folding.

    As a side effect the TOC, anchor and URL tables are built for the whole
    article (the parent of articleinfo) before anything is rendered.

    Writes a message to STDERR and exits with status 1 when articleinfo
    has no <title>.
    """
    article = articleinfo.parentNode
    makeTocHash(article, width)
    makeAnchorHash(article)
    makeURLHash(article)

    # getChildrenByTagName returns a (possibly empty) list, never None, so
    # the list itself has to be tested.
    title = articleinfo.getChildrenByTagName('title')
    if not title:
        STDERR.write("Article should have a title!\n")
        sys.exit(1)

    name = GetText(title[0].childNodes)
    authors = articleinfo.getChildrenByTagName('author')

    authorText = ''
    for author in authors:
        # Reset all three parts for every author. `email` used to be left
        # unbound (or to carry over the previous author's address) when an
        # author had no <email>.
        firstname = ''
        surname = ''
        email = ''
        if author.getElementsByTagName('firstname'):
            firstname = GetTextFromElementNode(author, 'firstname')[0]
        if author.getChildrenByTagName('surname'):
            surname = GetTextFromElementNode(author, 'surname')[0]
        if author.getElementsByTagName('email'):
            email = GetTextFromElementNode(author, 'email')[0]

        authorText += firstname + ' ' + surname
        if email:
            authorText += ' <' + email + '>'
        authorText += '\n'

    abstractText = ''
    abstract = articleinfo.getChildrenByTagName('abstract')
    if abstract:
        abstractText = '\n\n' + CenterText('Abstract\n========', width)
        abstractText += handleElement(abstract[0], width) + '\n'

    retText = CenterText(name + '\n*' + FILENAME + '*\n' + authorText, width)
    retText += abstractText

    toc = makeTOC(article, width)

    foldwarn = r'''
================================================================================
Viewing this file

This file can be viewed with all the sections and subsections folded to ease
navigation. By default, vim does not fold help documents. To create the folds,
press za now. The folds are created via a foldexpr which can be seen in the
last section of this file.

See |usr_28.txt| for an introduction to folding and |fold-commands| for key
sequences and commands to work with folds.
'''

    return retText + '\n' + RightJustify('*' + FILENAME + '-toc*', width) + \
        '\n' + toc + foldwarn


def handleOption(option, width):
    """
    Render an option: one right-justified '*name*' tag line per name,
    followed by the names set side by side with the text of the <desc>
    child, which is rendered in the width left over next to the longest
    name.
    """
    names = GetTextFromElementNode(option, "name")

    tagLines = ""
    for name in names:
        tagLines += ("*" + name + "*").rjust(width) + "\n"

    # every name is followed by four blanks separating it from the text
    padded = [name + "    " for name in names]
    maxNameLen = max([-1] + [len(entry) for entry in padded])
    nameTexts = "".join(entry + "\n" for entry in padded)

    desc = option.getChildrenByTagName("desc")[0]
    descText = handleElement(desc, width=width - maxNameLen)

    return tagLines + VertCatString(nameTexts + "    ", None, descText) + "\n"


def handleOptionDefault(default, width):
    """
    Render an option default as '<type>', a tab and '(<extra>)'; multiple
    'type' or 'extra' texts are joined with newlines. `width` is unused.
    """
    typeText = "\n".join(GetTextFromElementNode(default, "type"))
    extraText = "\n".join(GetTextFromElementNode(default, "extra"))
    return typeText + "\t(" + extraText + ")"


def handleTableRoot(root, width):
    """
    Render a <table>/<informaltable> using its first <tgroup>.

    The rows of the first <thead> (if any) come first and get a '~'
    appended to each of their lines; the rows of the first <tbody> follow.
    Column widths are fitted to `width` by
    calculateColumnWidthsDoublePass.

    Returns '' when there is no <tgroup> or the table has no rows.
    """
    # getChildrenByTagName returns a list: look at it before indexing
    # instead of comparing its first item with None.
    tgroups = root.getChildrenByTagName('tgroup')
    if not tgroups:
        return ''
    tgroup = tgroups[0]

    rows = []
    theads = tgroup.getChildrenByTagName('thead')
    if theads:
        rows += theads[0].getChildrenByTagName('row')
    numHeadRows = len(rows)

    tbodies = tgroup.getChildrenByTagName('tbody')
    if tbodies:
        rows += tbodies[0].getChildrenByTagName('row')

    if not rows:
        return ''

    widths, text = calculateColumnWidthsDoublePass(rows, width)

    headText = text[0:numHeadRows]
    bodyText = text[numHeadRows:]

    headTable = FormatTable(headText, ROW_SPACE=1, COL_SPACE=COL_SPACE,
                            justify=0, widths=widths)
    if headTable:
        # append '~' to every line of the head, including the last one
        headTable = re.sub(r'\n|$', r'\g<0>~', headTable)
    bodyTable = FormatTable(bodyText, ROW_SPACE=1, COL_SPACE=COL_SPACE,
                            justify=0, widths=widths)

    return headTable + '\n' + re.sub(r'\n+$', '', bodyTable) + '\n\n'


def calculateColumnWidths(rows, alloc_widths):
    """
    Render every <entry> of the given rows and measure the columns.

    alloc_widths is the list of widths the cells are rendered with, one
    per column. A list with a single width applies that width to every
    column of every row. The list is not modified.

    Returns (widths, text): widths maps a column index to the width of the
    widest cell in that column including COL_SPACE padding; text is a list
    with one list of rendered cell texts per row.
    """
    widths = {}
    text = []
    for row in rows:
        cols = row.getChildrenByTagName("entry")

        # Expand a single width for this row only. Expanding the argument
        # in place changed the caller's list and fixed the number of
        # columns to that of the first row, so an empty first row emptied
        # all the rows after it.
        if len(alloc_widths) == 1:
            row_widths = alloc_widths * len(cols)
        else:
            row_widths = alloc_widths

        rowtext = []
        for i, (col, width) in enumerate(zip(cols, row_widths)):
            coltext = handleElement(col, width)
            rowtext.append(coltext)

            # This is the 'width' of the current cell including the
            # whitespace padding.
            cellwidth = max(map(len, coltext.split("\n"))) + COL_SPACE

            # the width of a column is that of its widest cell
            widths[i] = max(cellwidth, widths.get(i, -1))

        text.append(rowtext)

    return widths, text


def calculateColumnWidthsDoublePass(rows, width):
    """
    Render the rows of a table so that the columns fit into `width`.

    A first pass lets every cell use the full width. If the resulting
    columns already fit (or there are no columns at all) that result is
    returned. Otherwise the columns wider than an equal share of `width`
    are restricted to an equal share of what the narrower columns leave
    over, and the rows are rendered a second time.

    Returns (widths, text) as calculateColumnWidths does.
    """
    maxwidths, text = calculateColumnWidths(rows, [width])

    # No columns means nothing to fit; this also keeps the division below
    # away from a zero column count.
    if not maxwidths or sum(maxwidths.values()) <= width:
        return maxwidths, text

    # Take the widths in column order explicitly rather than relying on
    # the order in which the dictionary yields its values.
    colwidths = [maxwidths[i] for i in sorted(maxwidths)]
    fairshare = width // len(colwidths)

    # now find out how many columns exceed the maximum permitted width.
    # nlarge: number of columns which are too wide.
    # remainingWidth: width which these large columns can share.
    nlarge = 0
    remainingWidth = width
    for colwidth in colwidths:
        if colwidth > fairshare:
            nlarge += 1
        else:
            remainingWidth -= colwidth

    # newmaxwidth: width which each of the large columns is allowed.
    newmaxwidth = remainingWidth // max(nlarge, 1)
    newcolwidths = [min(colwidth, newmaxwidth) for colwidth in colwidths]

    # make another run and this time ask each cell to restrict itself to
    # the width calculated above.
    return calculateColumnWidths(rows, newcolwidths)


def handleCode(code, width):
    """
    Return the text of a code listing, passed through VertCatString with a
    four blank prefix, between the '&codebegin;' and '&codeend;' markers
    which the main script later rewrites. `width` is unused.
    """
    listing = VertCatString("    ", 4, GetText(code.childNodes))
    return " &codebegin;\n" + listing + "&codeend;"


def handleList(list, width, marker=0):
    """
    Render the items of a list element, one block per item.

    An 'orderedlist' renders its <listitem> children prefixed with
    '1. ', '2. ', ...; a 'simplelist' renders its <member> children
    without a prefix; any other list renders its <member> children
    prefixed with '- '. Every item is rendered in the width left over
    after its prefix. `marker` is unused.
    """
    ordered = list.tagName == 'orderedlist'
    if ordered:
        child = 'listitem'
    else:
        child = 'member'
        if list.tagName == 'simplelist':
            decoration = ''
        else:
            decoration = '- '

    chunks = []
    for index, item in enumerate(list.getChildrenByTagName(child)):
        if ordered:
            decoration = str(index + 1) + '. '

        body = handleElement(item, width - len(decoration))
        body = VertCatString(decoration, None, body)
        chunks.append('\n' + re.sub(r'\s+$', '', body) + "\n")

    return "".join(chunks)


def handleNote(note, width):
    """
    Render a <note> as text prefixed with 'NOTE: '.

    A <title> child is removed from the note and printed, underlined with
    dashes, above the text. If the note has an 'id', a right-justified
    line with its '*id*' tag and its generated ANCHOR_HASH tag precedes
    the note.
    """
    titleNodes = note.getChildrenByTagName('title')
    if titleNodes:
        caption = GetText(titleNodes[0].childNodes)
        # take the title out so that it is not rendered with the body
        note.removeChild(titleNodes[0])

    tagLine = ''
    ident = note.getAttribute('id')
    if ident:
        tags = '*' + ident + '* ' + '*' + ANCHOR_HASH[ident] + '*'
        tags = IndentParagraphs(tags, width / 2)
        tagLine = RightJustify(tags, width) + '\n'

    body = handleElement(note, width - len("NOTE: "))
    if titleNodes:
        body = caption + '\n' + ('-' * len(caption)) + '\n' + body

    return tagLine + VertCatString("NOTE: ", None, body) + "\n"


def handleParagraph(paragraph, width):
    """
    Render a paragraph: its converted text without leading or trailing
    newlines, followed by one empty line.
    """
    body = handleElement(paragraph, width)
    return body.strip('\n') + "\n\n"


def handleFormalParagraph(formalparagraph, width):
    """
    Render a formal paragraph: like a plain paragraph, except that a
    <title> child is removed from the element and printed, underlined with
    dashes, above the text.
    """
    titleNodes = formalparagraph.getChildrenByTagName('title')
    if titleNodes:
        caption = GetText(titleNodes[0].childNodes)
        # take the title out so that it is not rendered with the body
        formalparagraph.removeChild(titleNodes[0])

    body = handleElement(formalparagraph, width).strip('\n')
    if titleNodes:
        body = caption + '\n' + ('-' * len(caption)) + '\n' + body

    return body + "\n\n"


def handleBlockQuote(block, width):
    """
    Render a block quote: its content converted in a width reduced by
    BLOCK_QUOTE and prefixed with BLOCK_QUOTE blanks, plus a newline.
    """
    indent = " " * BLOCK_QUOTE
    quoted = handleElement(block, width - BLOCK_QUOTE)
    return VertCatString(indent, BLOCK_QUOTE, quoted) + "\n"


def handleLink(link, width):
    """
    Render a <link>: its text followed by ' [|tag|]', where tag is the
    ANCHOR_HASH entry of the 'linkend' attribute.

    An unknown linkend produces a warning on STDERR and an empty tag.
    """
    target = link.getAttribute('linkend')
    if not (target in ANCHOR_HASH):
        print >> STDERR, "Warning: Link ID [%s] not found in TOC" % target

    label = handleElement(link, width)
    tag = ANCHOR_HASH.get(target) or ''

    return label + ' [|' + tag + '|]'


def handleAnchor(anchor, width):
    """
    Render an <anchor> as a right-justified line holding its '*id*' tag
    and, when one was generated, its ANCHOR_HASH tag.

    An anchor without an 'id' has nothing to tag and renders as ''.
    makeAnchorHash skips such anchors, so looking them up raised KeyError.
    """
    anchorid = anchor.getAttribute('id')
    if not anchorid:
        return ''

    anchorText = '*' + anchorid + '* '
    # The generated tag is missing when the anchor table was never built;
    # the id tag alone is still a usable jump target.
    if anchorid in ANCHOR_HASH:
        anchorText += '*' + ANCHOR_HASH[anchorid] + '*'

    return RightJustify(anchorText, width) + "\n"


def handleSection(section, width):
    """
    Render a <section>: a delimiter line, a header made of the title and
    the section's tags, the table of contents of its subsections and the
    section text.

    The tags are the section's TOC label, its generated anchor tag and,
    when it differs from the label, its id. Sections of level 0 are
    delimited with '=', those of level 1 with '-' and deeper ones with an
    empty line. The <title> child is removed from the section before its
    content is rendered.
    """
    title = section.getChildrenByTagName('title')[0]
    name = handleElement(title, width)

    sectionid = section.getAttribute('id')
    tagsformatted = ''
    if sectionid in TOC_HASH:
        tagsformatted = '*%s* ' % TOC_HASH[sectionid]

    if sectionid in ANCHOR_HASH:
        tagsformatted += '*%s* ' % ANCHOR_HASH[sectionid]

    if sectionid and sectionid in TOC_HASH and \
            sectionid != TOC_HASH[sectionid]:
        tagsformatted += '*%s*' % sectionid

    # format the tags within a width of 30
    tagsformatted = RightJustify(IndentParagraphs(tagsformatted, 30), 0)
    tagswidth = TextWidth(tagsformatted)

    # width(name) + nspaces + width(tags) = TEXT_WIDTH, the same width the
    # delimiter line below is drawn with.
    if len(tagsformatted) > 2:
        header = VertCatString(name, TEXT_WIDTH - tagswidth, tagsformatted)
    else:
        header = name

    # the title is part of the header; keep it out of the section text
    section.removeChild(title)
    text = handleElement(section, width)

    thislevel = LEVEL_HASH.get(sectionid, -1)
    if thislevel == 0:
        delim = '='
        newlines = '\n\n'
    elif thislevel == 1:
        delim = '-'
        newlines = '\n'
    else:
        delim = ''
        newlines = '\n'

    # note that a section missing from LEVEL_HASH (level -1) also gets here
    thisTOC = ''
    if thislevel <= 1:
        thisTOC = makeTOC(section, width, maxlevel=1)

    return "\n" + (delim * TEXT_WIDTH) + \
        "\n" + header + newlines + thisTOC + newlines + re.sub(
            r'\n+$', '', text) + "\n"


def handleUlink(ulink, width):
    """
    Render a <ulink>: its text followed by ' |tag|', where tag is the
    URL_HASH entry of its 'url' attribute.

    A missing url, or a url without an entry in URL_HASH, produces a
    warning on STDERR and the text alone.
    """
    url = ulink.getAttribute('url')
    # Render in the width we were given, as handleLink does, rather than
    # in the default width of handleElement.
    text = handleElement(ulink, width)

    if not url:
        print >> STDERR, "Warning: url attribute empty for [%s]" % text
        return text

    # URL_HASH is created while the article info is handled. Without that
    # step the url is unknown, which used to surface as a KeyError.
    label = URL_HASH.get(url)
    if label is None:
        print >> STDERR, "Warning: URL [%s] not found in URL table" % url
        return text

    return text + ' |%s|' % label


def handleIndexTerm(indexterm, width):
    """Index terms leave no trace in the output: always returns ''."""
    return ""


def handleEmphasis(emphasis, width):
    """Render emphasized text between underscores. `width` is unused."""
    emphasized = GetText(emphasis.childNodes)
    return '_' + emphasized + '_'

###############################################################################
# A dictionary for mapping xml tags to functions.
###############################################################################
# Maps a tag name to the function rendering it. handleElement() looks up
# the tag names of elements here, as well as the data of <?vimhelp ...?>
# processing instructions. Every handler is called as handler(node, width)
# and returns the text for that node.
handlerMaps = {
    'articleinfo': handleArticleInfo,
    'table': handleTableRoot,
    'informaltable': handleTableRoot,
    'code': handleCode,
    'programlisting': handleCode,
    'list': handleList,
    'simplelist': handleList,
    'orderedlist': handleList,
    'para': handleParagraph,
    'formalpara': handleFormalParagraph,
    'note': handleNote,
    'link': handleLink,
    'anchor': handleAnchor,
    'section': handleSection,
    'blockquote': handleBlockQuote,
    'ulink': handleUlink,
    'emphasis': handleEmphasis,
    'indexterm': handleIndexTerm
}
# Tags which are part of running text: IsInlineTag() returns 1 for them, so
# handleElement() gathers them together with neighbouring text nodes. Only
# the truth of the value is looked at.
inlineTags = {'tag': 1, 'literal': 1, 'link': 1,
              'ulink': 1, 'citetitle': 1, 'indexterm': 1,
              'emphasis': 1, 'filename': 1}

# helper functions for usage() and printerr()


def usage():
    """Print the module docstring, which holds the usage text."""
    print(__doc__)


def printerr(statement):
    """Print statement on STDERR, but only when DEBUG is set."""
    if not DEBUG:
        return
    print >> STDERR, statement


def replaceComment(matchobj):
    """
    Substitution callback for the '&codebegin; ... &codeend;' pattern.

    The match provides the leading run of '<' and blanks (group 1), the
    rest of the line before the marker (group 2) and the code (group 3).
    The line is ended with ' >', the code follows, and the line after it
    starts with the leading run again, its first column turned into '<'
    unless the run is empty or already starts with '<'.
    """
    initspace, firstsent, code = matchobj.group(1, 2, 3)

    if not initspace or initspace.startswith('<'):
        lastspace = initspace
    else:
        # keep the length of the run: add the '<', drop the last character
        lastspace = '<' + initspace[:-1]

    return ''.join(
        ['\n', initspace, firstsent, ' >\n', code, '\n', lastspace])


if __name__ == "__main__":
    # Command line: db2vim [-d] [-s] [--prefix=<prefix>] [--help] file.xml
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'ds', ['prefix=', 'help'])
    except getopt.GetoptError:
        print >> STDERR, "Usage error: db2vim --help for usage"
        sys.exit(1)

    option = dict(opts)

    if '--help' in option:
        usage()
        sys.exit(0)

    # These assignments are at module level, so they set the globals which
    # the handlers read.
    TOC_PREFIX = option.get('--prefix', 'ls_')
    DEBUG = '-d' in option
    STRICT = '-s' in option

    if len(args) != 1:
        print >> STDERR, "Usage error: db2vim --help for usage"
        sys.exit(1)

    fileName = args[0]
    FILENAME = re.sub(r'\.\w+$', r'.txt', fileName)

    # Give up if the file cannot be opened. Carrying on used to fail a few
    # lines later with a NameError on `fp`. The message goes to STDERR
    # because stdout carries the generated document.
    try:
        fp = open(fileName)
    except IOError:
        print >> STDERR, "Error opening xml file"
        sys.exit(1)

    try:
        dom = xml.dom.minidom.parse(fp)
    finally:
        fp.close()

    modeline = r'''
================================================================================
About this file

This file was created automatically from its XML variant using db2vim. db2vim is
a python script which understands a very limited subset of the Docbook XML 4.2
DTD and outputs a plain text file in vim help format.

db2vim can be obtained via anonymous CVS from sourceforge.net. Use

cvs -d:pserver:anonymous@cvs.vim-latex.sf.net:/cvsroot/vim-latex co db2vim

Or you can visit the web-interface to sourceforge CVS at:
http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/vim-latex/db2vim/

The following modelines should nicely fold up this help manual.

vim:ft=help:fdm=expr:nowrap
vim:foldexpr=getline(v\:lnum-1)=~'-\\{80}'?'>2'\:getline(v\:lnum-1)=~'=\\{80}'?'>1'\:getline(v\:lnum)=~'=\\{80}'?'0'\:getline(v\:lnum)=~'-\\{80}'?'1'\:'='
vim:foldtext=substitute(v\:folddashes.substitute(getline(v\:foldstart),'\\s*\\*.*',"",""),'^--','\ \ \ \ \ \ ','')
================================================================================'''

    pattern = re.compile(
        r'\n([< ]*)([^\n]+)&codebegin;\n(.*?)&codeend;', re.DOTALL)

    processedDoc = handleElement(dom.documentElement)

    # One pass may leave markers behind, so repeat until none is left.
    # Stop as well when a pass replaces nothing: a marker the pattern
    # cannot match (e.g. one without a preceding newline) would otherwise
    # keep this loop running forever.
    while '&codebegin;' in processedDoc:
        processedDoc, replaced = pattern.subn(replaceComment, processedDoc)
        if not replaced:
            break

    urlsection = r"""
================================================================================
URLs used in this file

"""
    # list the urls ordered by their generated label
    for label, url in sorted(zip(URL_HASH.values(), URL_HASH.keys())):
        urlsection += '*%s* : %s\n' % (label, url)

    processedDoc = processedDoc + urlsection + modeline
    print(processedDoc.encode('iso-8859-1'))
# vim:et:sts=4
