#!/usr/bin/env python
from htmllib import HTMLParser
from formatter import AbstractFormatter, DumbWriter
# Entity references that are replaced in the raw HTML *before* it is fed to
# the parser. The original author observed that the HTMLParser drops these
# entities from its output entirely (cause unknown), so the important ones
# are turned into mundane but readable ASCII punctuation up front.
#
# Keys are spelled as entity references (plain ASCII) rather than as the
# decoded characters: the references are what appears in the HTML source,
# and ASCII keys keep this file loadable without an encoding declaration.
entitymap = {
    '&rdquo;': '"',   # right double quotation mark
    '&ldquo;': '"',   # left double quotation mark
    '&rsquo;': "'",   # right single quotation mark / apostrophe
    '&nbsp;': ' ',    # non-breaking space
}
import sys
def main(argv):
    """Write a tag-free text rendering of an HTML file.

    Reads the file named by ``argv[1]``, applies the textual substitutions
    below, feeds the result through ``HTMLParser`` wired to a
    ``DumbWriter``, and writes the output to ``<argv[1]>-tagless``.

    argv -- the command line, laid out like ``sys.argv``; arguments after
            the first are ignored.

    Exits with a usage message when no input path is given.
    """
    if len(argv) < 2:
        sys.exit('usage: %s HTMLFILE' % (argv[0] if argv else 'script'))
    source_path = argv[1]

    # Read the input before creating the output file, so an unreadable or
    # missing input does not leave an empty "-tagless" file behind.
    with open(source_path) as source:
        text = source.read()

    # NOTE(review): the search string here is empty, so this call inserts
    # '] -->' between every character of the text. The intended search
    # string appears to have been lost (possibly markup that was stripped
    # from this file at some point); restore it from the original script.
    # The call is kept exactly as found rather than guessed at.
    text = text.replace('', '] -->')

    # Swap out the entities listed in entitymap before parsing.
    for k, v in entitymap.items():
        text = text.replace(k, v)

    # The with-block guarantees the output is flushed and closed even if
    # the parser raises part-way through.
    with open(source_path + '-tagless', 'w') as sink:
        p = HTMLParser(AbstractFormatter(DumbWriter(sink)))
        p.feed(text)
        p.close()


if __name__ == "__main__":
    main(sys.argv)