Login
or
register
Overview
Introduction
Features
Credits
License
This Wiki
Status
News
Releases
Change Log
Recent Wiki Edits
User Guide
Installation
Command Line
Using as a Module
Integration
Reporting Bugs
Advanced
Mailing List
Source Code
Test Suite
Extensions
Extensions
Writing Extensions
Available Extensions
Related Projects
odt2txt_py
Anonymous users must enter
captcha
below.
Don't put anything here
Page Parameters
Page Name (for URL)
Page Title
Don't put anything here
Don't put anything here
Show Advanced Options
Category
Prototype
Page Content
#!/usr/bin/python

## To access this file as plain text please go to
## http://freewisdom.org/projects/python-markdown/odt2txt_py.raw_content
## After downloading save with ".py" extension

"""
ODT2TXT
=======

ODT2TXT converts files in Open Document Text format (ODT) into
Markdown-formatted plain text.

Written by [Yuri Takhteyev](http://www.freewisdom.org).

Project website: http://www.freewisdom.org/projects/python-markdown/odt2txt
Contact: yuri [at] freewisdom.org

License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD

Version: 0.1 (April 7, 2006)

"""

import sys, zipfile, xml.dom.minidom

# Tags whose content is dropped from the output entirely.
IGNORED_TAGS = ["office:annotation"]
FOOTNOTE_STYLES = ["Footnote"]

# Markdown only recognises a code block when every line is indented by at
# least four spaces (or a tab).  The same prefix is used to emit code lines
# and to recognise them again in compressCodeBlocks().
# NOTE(review): the listing this was recovered from had its whitespace
# collapsed and showed a single space here - confirm against the upstream file.
CODE_INDENT = "    "


def _firstElementChild(node):
    """Return the first child of *node* that is an element, or None."""
    for child in node.childNodes:
        if child.nodeType == xml.dom.Node.ELEMENT_NODE:
            return child
    return None


class TextProps:
    """ Holds properties for a text style. """

    def __init__(self):
        self.italic = False
        self.bold = False
        self.fixed = False

    def setItalic(self, value):
        """Turn italic on when *value* is the string "italic"."""
        if value == "italic":
            self.italic = True

    def setBold(self, value):
        """Turn bold on when *value* is the string "bold"."""
        if value == "bold":
            self.bold = True

    def setFixed(self, value):
        """Record whether the style uses a fixed-pitch font."""
        self.fixed = value

    def __str__(self):
        return "[i=%s, b=%s, fixed=%s]" % (str(self.italic),
                                           str(self.bold),
                                           str(self.fixed))


class ParagraphProps:
    """ Holds properties of a paragraph style. """

    def __init__(self):
        self.blockquote = False
        self.headingLevel = 0
        self.code = False
        self.title = False
        self.indented = 0

    def setIndented(self, value):
        self.indented = value

    def setHeading(self, level):
        self.headingLevel = level

    def setTitle(self, value):
        self.title = value

    def setCode(self, value):
        self.code = value

    def __str__(self):
        return "[bq=%s, h=%d, code=%s]" % (str(self.blockquote),
                                           self.headingLevel,
                                           str(self.code))


class ListProperties:
    """ Holds properties for a list style. """

    def __init__(self):
        self.ordered = False

    def setOrdered(self, value):
        self.ordered = value


class OpenDocumentTextFile:
    """
    An ODT document loaded from disk that can be rendered as Markdown.

    Construction reads "styles.xml" and "content.xml" from the archive and
    indexes the text, paragraph and list styles; toString() then walks the
    document body and produces the Markdown text.
    """

    def __init__(self, filepath):
        self.footnotes = []
        self.footnoteCounter = 0
        self.textStyles = {"Standard": TextProps()}
        self.paragraphStyles = {"Standard": ParagraphProps()}
        self.listStyles = {}
        self.fixedFonts = []
        self.hasTitle = 0

        self.load(filepath)

    def processFontDeclarations(self, fontDecl):
        """ Extracts necessary font information from a font-declaration
            element: the names of all fixed-pitch fonts are remembered in
            self.fixedFonts. """
        for fontFace in fontDecl.getElementsByTagName("style:font-face"):
            if fontFace.getAttribute("style:font-pitch") == "fixed":
                self.fixedFonts.append(fontFace.getAttribute("style:name"))

    def extractTextProperties(self, style, parent=None):
        """ Extracts text properties from a style element.

            style  -- a "style:style" DOM element
            parent -- optional name of the parent text style; when that
                      style is already known its properties are inherited

            Returns a new TextProps instance. """
        textProps = TextProps()

        if parent:
            parentProps = self.textStyles.get(parent, None)
            if parentProps:
                # Copy the values rather than reusing the parent's object,
                # otherwise the child's settings would leak into the parent.
                textProps.italic = parentProps.italic
                textProps.bold = parentProps.bold
                textProps.fixed = parentProps.fixed

        textPropEl = style.getElementsByTagName("style:text-properties")
        if not textPropEl:
            return textProps
        textPropEl = textPropEl[0]

        # An attribute that is present overrides the inherited value; an
        # absent attribute (empty string) leaves it untouched.
        italic = textPropEl.getAttribute("fo:font-style")
        if italic:
            textProps.italic = (italic == "italic")

        bold = textPropEl.getAttribute("fo:font-weight")
        if bold:
            textProps.bold = (bold == "bold")

        fontName = textPropEl.getAttribute("style:font-name")
        if fontName:
            textProps.setFixed(fontName in self.fixedFonts)

        return textProps

    def extractParagraphProperties(self, style, parent=None):
        """ Extracts paragraph properties from a style element.

            style  -- a "style:style" DOM element
            parent -- name of the parent style (accepted for symmetry with
                      extractTextProperties; currently not used)

            Returns a new ParagraphProps instance. """
        paraProps = ParagraphProps()

        name = style.getAttribute("style:name")

        if name.startswith("Heading_20_"):
            try:
                paraProps.setHeading(int(name[len("Heading_20_"):]))
            except ValueError:
                pass  # non-numeric suffix: not a numbered heading

        if name == "Title":
            paraProps.setTitle(True)

        paraPropEl = style.getElementsByTagName("style:paragraph-properties")
        if paraPropEl:
            leftMargin = paraPropEl[0].getAttribute("fo:margin-left")
            if leftMargin:
                try:
                    # Drop the two-character unit suffix before converting.
                    if float(leftMargin[:-2]) > 0.01:
                        paraProps.setIndented(True)
                except ValueError:
                    pass  # unparsable margin: treat as not indented

        # A paragraph whose own text properties select a fixed-pitch font
        # is rendered as code.
        if self.extractTextProperties(style).fixed:
            paraProps.setCode(True)

        return paraProps

    def processStyles(self, styleElements):
        """ Runs through "style" elements extracting necessary information.
            Text styles go to self.textStyles, paragraph styles to
            self.paragraphStyles; the "Standard" style is never replaced. """
        for style in styleElements:
            name = style.getAttribute("style:name")
            if name == "Standard":
                continue

            family = style.getAttribute("style:family")
            parent = style.getAttribute("style:parent-style-name")

            if family == "text":
                self.textStyles[name] = self.extractTextProperties(
                    style, parent)
            elif family == "paragraph":
                self.paragraphStyles[name] = (
                    self.extractParagraphProperties(style, parent))

    def processListStyles(self, listStyleElements):
        """ Records, for each list style, whether it is ordered.  A style
            is ordered when its first child element is a
            "text:list-level-style-number". """
        for style in listStyleElements:
            name = style.getAttribute("style:name")
            prop = ListProperties()
            firstLevel = _firstElementChild(style)
            if (firstLevel is not None and
                    firstLevel.tagName == "text:list-level-style-number"):
                prop.setOrdered(True)
            self.listStyles[name] = prop

    def _loadStyleInfo(self, document):
        """ Indexes fonts, styles and list styles found in *document*. """
        # Loop instead of indexing [0] so a document without any
        # font-face declarations is accepted.
        for fontDecl in document.getElementsByTagName(
                "office:font-face-decls"):
            self.processFontDeclarations(fontDecl)
        self.processStyles(document.getElementsByTagName("style:style"))
        self.processListStyles(document.getElementsByTagName(
            "text:list-style"))

    def load(self, filepath):
        """ Loads an ODT file.

            filepath -- path (or file-like object) of the ODT archive

            Sets self.content to the parsed "content.xml" document. """
        archive = zipfile.ZipFile(filepath)
        try:
            stylesXml = archive.read("styles.xml")
            contentXml = archive.read("content.xml")
        finally:
            archive.close()

        # styles.xml first, so that styles declared in content.xml can
        # inherit from the ones declared there.
        self._loadStyleInfo(xml.dom.minidom.parseString(stylesXml))

        self.content = xml.dom.minidom.parseString(contentXml)
        self._loadStyleInfo(self.content)

    def compressCodeBlocks(self, text):
        """ Removes extra blank lines from code blocks: a blank line is
            dropped when both the line before and the line after it are
            code lines.

            Every kept line is prefixed with a newline, so the result
            always starts with "\\n". """
        lines = text.split("\n")
        last = len(lines) - 1
        kept = []
        for i, line in enumerate(lines):
            betweenCode = (0 < i < last
                           and lines[i - 1].startswith(CODE_INDENT)
                           and lines[i + 1].startswith(CODE_INDENT))
            if line.strip() or not betweenCode:
                kept.append(line)
        return "".join(["\n" + line for line in kept])

    def listToString(self, listElement):
        """ Converts a "text:list" element into Markdown list items.  Only
            the first element inside each item is rendered. """
        styleName = listElement.getAttribute("text:style-name")
        props = self.listStyles.get(styleName, ListProperties())

        pieces = []
        i = 0
        for item in listElement.childNodes:
            if item.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            firstChild = _firstElementChild(item)
            if firstChild is None:
                continue
            i += 1

            if props.ordered:
                number = str(i)
                # Pad single digits to line up with the 3-column indent,
                # but always keep at least one space after the dot.
                marker = number + "." + " " * max(1, 2 - len(number))
                pieces.append(marker + self.paragraphToString(firstChild,
                                                              indent=3))
            else:
                pieces.append("* " + self.paragraphToString(firstChild,
                                                            indent=2))
            pieces.append("\n\n")

        return "".join(pieces)

    def toString(self):
        """ Converts the document to a string.

            Returns the Markdown text; footnotes collected while rendering
            are appended after a "--------" separator. """
        # Start from a clean slate so that repeated calls give the same
        # result instead of accumulating footnotes.
        self.footnotes = []
        self.hasTitle = 0

        root = self.content.getElementsByTagName("office:text")[0]

        blocks = [node for node in root.childNodes
                  if node.nodeType == xml.dom.Node.ELEMENT_NODE
                  and node.tagName in ("text:p", "text:h", "text:list")]

        buffer = u""
        for block in blocks:
            if block.tagName == "text:list":
                rendered = self.listToString(block)
            else:
                rendered = self.paragraphToString(block)
            if rendered:
                buffer += rendered + "\n\n"

        if self.footnotes:
            buffer += "--------\n\n"
            for cite, note in self.footnotes:
                buffer += "[^%s]: %s\n\n" % (cite, note)

        return self.compressCodeBlocks(buffer)

    def _spanToString(self, node):
        """ Renders one "text:span" element with its inline style. """
        text = self.textToString(node)

        if not text.strip():
            return text  # don't apply styles to white space

        styleName = node.getAttribute("text:style-name")
        style = self.textStyles.get(styleName, None)

        if style is None:
            # Unknown style: make it visible in the output.
            mark = "<" + styleName + ">"
        elif style.fixed:
            return "`" + text + "`"
        elif style.italic and style.bold:
            mark = "***"
        elif style.italic:
            mark = "_"
        elif style.bold:
            mark = "**"
        else:
            mark = ""

        return "%s%s%s" % (mark, text, mark)

    def textToString(self, element):
        """ Converts the inline content of *element* to Markdown text.
            Footnotes encountered on the way are appended to
            self.footnotes and replaced by a "[^cite]" reference. """
        pieces = [u""]

        for node in element.childNodes:

            if node.nodeType == xml.dom.Node.TEXT_NODE:
                pieces.append(node.nodeValue)
                continue
            if node.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue

            tag = node.tagName

            if tag == "text:span":
                pieces.append(self._spanToString(node))

            elif tag == "text:note":
                cite = (node.getElementsByTagName("text:note-citation")[0]
                        .childNodes[0].nodeValue)
                body = (node.getElementsByTagName("text:note-body")[0]
                        .childNodes[0])
                self.footnotes.append((cite, self.textToString(body)))
                pieces.append("[^%s]" % cite)

            elif tag in IGNORED_TAGS:
                pass

            elif tag == "text:s":
                # "text:c" holds the number of spaces; a missing or
                # malformed count means a single space.
                try:
                    pieces.append(" " * int(node.getAttribute("text:c")))
                except ValueError:
                    pieces.append(" ")

            elif tag == "text:tab":
                pieces.append(" ")

            elif tag == "text:a":
                text = self.textToString(node)
                link = node.getAttribute("xlink:href")
                pieces.append("[%s](%s)" % (text, link))

            else:
                # Unsupported tag: leave a visible placeholder.
                pieces.append(" {" + tag + "} ")

        return "".join(pieces)

    def paragraphToString(self, paragraph, indent=0):
        """ Converts a paragraph or heading element to Markdown.

            paragraph -- the DOM element to render
            indent    -- number of spaces used for continuation lines

            A paragraph whose style is unknown is rendered as plain text. """
        styleName = paragraph.getAttribute("text:style-name")
        paraProps = self.paragraphStyles.get(styleName)
        if paraProps is None:
            paraProps = ParagraphProps()
        text = self.textToString(paragraph)

        if not paraProps.code:
            text = text.strip()

        if paraProps.title:
            self.hasTitle = 1
            return text + "\n" + ("=" * len(text))

        if paraProps.headingLevel:
            level = paraProps.headingLevel
            # Once a title has been emitted it occupies level 1, so all
            # headings move one level down.
            if self.hasTitle:
                level += 1

            if level == 1:
                return text + "\n" + ("=" * len(text))
            elif level == 2:
                return text + "\n" + ("-" * len(text))
            else:
                return "#" * level + " " + text

        elif paraProps.code:
            lines = [CODE_INDENT + line for line in text.split("\n")]
            return "\n".join(lines)

        if paraProps.indented:
            return self.wrapParagraph(text, indent=indent, blockquote=True)
        else:
            return self.wrapParagraph(text, indent=indent)

    def wrapParagraph(self, text, indent=0, blockquote=False):
        """ Re-flows *text* into lines, breaking once the tokens on the
            current line total more than LIMIT - indent characters (the
            separating spaces are not counted).  Continuation lines are
            indented by *indent* spaces and, for block quotes, every line
            starts with "> ".  Each token is followed by one space. """
        LIMIT = 50

        pieces = []
        if blockquote:
            pieces.append("> ")

        counter = 0
        for token in text.split():
            if counter > LIMIT - indent:
                pieces.append("\n" + " " * indent)
                if blockquote:
                    pieces.append("> ")
                counter = 0
            pieces.append(token + " ")
            counter += len(token)

        return "".join(pieces)


if __name__ == "__main__":

    if len(sys.argv) != 2:
        sys.exit("Usage: %s FILE.odt" % sys.argv[0])

    odt = OpenDocumentTextFile(sys.argv[1])
    out_utf8 = odt.toString().encode("utf-8")

    # Python 3 only accepts bytes on the underlying binary stream;
    # Python 2's sys.stdout has no "buffer" and takes the bytes directly.
    getattr(sys.stdout, "buffer", sys.stdout).write(out_utf8)
Don't put anything here
About this Edit
Don't put anything here
Minor Edit
Edit Summary
Don't put anything here
Powered by
Sputnik
|
XHTML 1.1