import sys
import re
from lxml import etree # requires lxml 2.0
from copy import deepcopy
print "HTML5 Spec Splitter"
absolute_uris = False
use_html5lib_parser = False
use_html5lib_serialiser = False
file_args = []
for arg in sys.argv[1:]:
if arg == '--absolute':
absolute_uris = True
elif arg == '--html5lib-parser':
use_html5lib_parser = True
elif arg == '--html5lib-serialiser':
use_html5lib_serialiser = True
else:
file_args.append(arg)
if len(file_args) != 2:
print 'Run like "python [options] spec-splitter.py index multipage"'
print '(The directory "multipage" must already exist)'
print
print 'Options:'
print ' --absolute ............. convert relative URLs to absolute (e.g. for images)'
print ' --html5lib-parser ...... use html5lib parser instead of lxml'
print ' --html5lib-serialiser .. use html5lib serialiser instead of lxml'
sys.exit()
if use_html5lib_parser or use_html5lib_serialiser:
import html5lib
import html5lib.serializer
import html5lib.treewalkers
index_page = 'index'
# The document is split on all
elements, plus the following specific elements
# (which were chosen to split any pages that were larger than about 100-200KB, and
# may need to be adjusted as the spec changes):
split_exceptions = [
'common-microsyntaxes', 'urls', 'fetching-resources', 'common-dom-interfaces', 'namespaces', # <-- infrastructure
'elements', # <-- dom
'scripting-1', 'sections', 'grouping-content', 'text-level-semantics', 'edits',
'embedded-content-1', 'the-iframe-element', 'the-video-element', 'the-canvas-element', 'the-map-element', 'tabular-data',
'forms', 'the-input-element', 'states-of-the-type-attribute', 'number-state', 'common-input-element-attributes', 'the-button-element', 'association-of-controls-and-forms',
'interactive-elements', 'commands', 'common-idioms', 'selectors', # <-- semantics
'origin-0', 'timers', 'offline', 'history', 'links', # <-- browsers
'dnd', # <-- editing
'workers', 'network', 'web-messaging', 'webstorage',
'parsing', 'tokenization', 'tree-construction', 'the-end', 'named-character-references', # <-- syntax
]
print "Parsing..."
# Parse document
if use_html5lib_parser:
parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
doc = parser.parse(open(file_args[0]), encoding='utf-8')
else:
parser = etree.HTMLParser(encoding='utf-8')
doc = etree.parse(open(file_args[0]), parser)
print "Splitting..."
doctitle = doc.find('.//title').text
# Absolutise some references, so the spec can be hosted elsewhere
if absolute_uris:
for a in ('href', 'src'):
for t in ('link', 'script', 'img'):
for e in doc.findall('//%s[@%s]' % (t, a)):
if e.get(a)[0] == '/':
e.set(a, 'http://www.whatwg.org' + e.get(a))
else:
e.set(a, 'http://www.whatwg.org/specs/web-apps/current-work/' + e.get(a))
# Extract the body from the source document
original_body = doc.find('body')
# Create an empty body, for the page content to be added into later
default_body = etree.Element('body')
if original_body.get('class'): default_body.set('class', original_body.get('class'))
if original_body.get('onload'): default_body.set('onload', 'fixBrokenLink(); %s' % original_body.get('onload'))
original_body.getparent().replace(original_body, default_body)
# Extract the header, so we can reuse it in every page
header = original_body.find('.//*[@class="head"]')
# Make a stripped-down version of it
short_header = deepcopy(header)
del short_header[2:]
# Extract the items in the TOC (remembering their nesting depth)
def extract_toc_items(items, ol, depth):
for li in ol.iterchildren():
for c in li.iterchildren():
if c.tag == 'a':
assert c.get('href')[0] == '#'
items.append( (depth, c.get('href')[1:], c) )
elif c.tag == 'ol':
extract_toc_items(items, c, depth+1)
toc_items = []
extract_toc_items(toc_items, original_body.find('.//ol[@class="toc"]'), 0)
# Prepare the link-fixup script
link_fixup_script = etree.XML('')
doc.find('head')[-1].tail = '\n '
doc.find('head').append(link_fixup_script)
link_fixup_script.tail = '\n '
# Stuff for fixing up references:
def get_page_filename(name):
return '%s.html' % name
# Finds all the ids and remembers which page they were on
id_pages = {}
def extract_ids(page, node):
if node.get('id'):
id_pages[node.get('id')] = page
for e in node.findall('.//*[@id]'):
id_pages[e.get('id')] = page
# Updates all the href="#id" to point to page#id
missing_warnings = set()
def fix_refs(page, node):
for e in node.findall('.//a[@href]'):
if e.get('href')[0] == '#':
id = e.get('href')[1:]
if id in id_pages:
if id_pages[id] != page: # only do non-local links
e.set('href', '%s#%s' % (get_page_filename(id_pages[id]), id))
else:
missing_warnings.add(id)
def report_broken_refs():
for id in sorted(missing_warnings):
print "warning: can't find target for #%s" % id
pages = [] # for saving all the output, so fix_refs can be called in a second pass
# Iterator over the full spec's body contents
child_iter = original_body.iterchildren()
def add_class(e, cls):
if e.get('class'):
e.set('class', e.get('class') + ' ' + cls)
else:
e.set('class', cls)
# Contents/intro page:
page = deepcopy(doc)
add_class(page.getroot(), 'split index')
page_body = page.find('body')
# Keep copying stuff from the front of the source document into this
# page, until we find the first heading that isn't class="no-toc"
for e in child_iter:
if e.getnext().tag == 'h2' and 'no-toc' not in (e.getnext().get('class') or '').split(' '):
break
page_body.append(e)
pages.append( (index_page, page, 'Front cover') )
# Section/subsection pages:
def first_elm(e):
for c in e.iterchildren(tag=etree.Element):
return c
return None
def should_split(e):
if e.tag == 'h2': return True
if e.get('id') in split_exceptions: return True
# handle wrapping
if e.tag == 'div':
c = first_elm(e)
if c:
if c.tag == 'h2': return True
if c.get('id') in split_exceptions: return True
return False
def get_heading_text_and_id(e):
if e.tag == 'div':
node = first_elm(e)
else:
node = e
title = re.sub('\s+', ' ', etree.tostring(node, method='text').strip())
return title, node.get('id')
for heading in child_iter:
# Handle the heading for this section
title, name = get_heading_text_and_id(heading)
if name == index_page: name = 'section-%s' % name
print ' <%s> %s - %s' % (heading.tag, name, title)
page = deepcopy(doc)
add_class(page.getroot(), 'split chapter')
page_body = page.find('body')
page.find('//title').text = title + u' \u2014 ' + doctitle
# Add the header
page_body.append(deepcopy(short_header))
# Add the page heading
page_body.append(deepcopy(heading))
extract_ids(name, heading)
# Keep copying stuff from the source, until we reach the end of the
# document or find a header to split on
e = heading
while e.getnext() is not None and not should_split(e.getnext()):
e = child_iter.next()
extract_ids(name, e)
page_body.append(deepcopy(e))
pages.append( (name, page, title) )
# Fix the links, and add some navigation:
for i in range(len(pages)):
name, doc, title = pages[i]
fix_refs(name, doc)
if name == index_page: continue # don't add nav links to the TOC page
head = doc.find('head')
nav = etree.Element('nav')
nav.text = '\n '
nav.tail = '\n\n '
if i > 1:
href = get_page_filename(pages[i-1][0])
title = pages[i-1][2]
a = etree.XML(u'\u2190 %s' % (href, title))
a.tail = u' \u2013\n '
nav.append(a)
link = etree.XML('' % (href, title))
link.tail = '\n '
head.append(link)
a = etree.XML('Table of contents' % index_page)
a.tail = '\n '
nav.append(a)
link = etree.XML('' % index_page)
link.tail = '\n '
head.append(link)
if i != len(pages)-1:
href = get_page_filename(pages[i+1][0])
title = pages[i+1][2]
a = etree.XML(u'%s \u2192' % (href, title))
a.tail = '\n '
nav.append(a)
a.getprevious().tail = u' \u2013\n '
link = etree.XML('' % (href, title))
link.tail = '\n '
head.append(link)
# Add a subset of the TOC to each page:
# Find the items that are on this page
new_toc_items = [ (d, id, e) for (d, id, e) in toc_items if id_pages[id] == name ]
if len(new_toc_items) > 1: # don't bother if there's only one item, since it looks silly
# Construct the new toc
new_toc = etree.XML(u'')
cur_ol = new_toc
cur_li = None
cur_depth = 0
# Add each item, reconstructing the nested s and
s to preserve
# the nesting depth of each item
for (d, id, e) in new_toc_items:
while d > cur_depth:
if cur_li is None:
cur_li = etree.XML(u'
')
cur_ol.append(cur_li)
cur_ol = etree.XML('')
cur_li.append(cur_ol)
cur_li = None
cur_depth += 1
while d < cur_depth:
cur_li = cur_ol.getparent()
cur_ol = cur_li.getparent()
cur_depth -= 1
cur_li = etree.XML(u'')
cur_li.append(deepcopy(e))
cur_ol.append(cur_li)
nav.append(new_toc)
doc.find('body').insert(1, nav) # after the header
report_broken_refs()
print "Outputting..."
# Output all the pages
for name, doc, title in pages:
f = open('%s/%s' % (file_args[1], get_page_filename(name)), 'w')
if use_html5lib_serialiser:
tokens = html5lib.treewalkers.getTreeWalker('lxml')(doc)
serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
for text in serializer.serialize(tokens, encoding='us-ascii'):
f.write(text)
else:
f.write(etree.tostring(doc, pretty_print=False, method="html"))
# Generate the script to fix broken links
f = open('%s/fragment-links.js' % (file_args[1]), 'w')
links = ','.join('"%s":"%s"' % (k.replace("\\", "\\\\").replace('"', '\\"'), v) for (k,v) in id_pages.items())
f.write('var fragment_links = { ' + re.sub(r"([^\x20-\x7f])", lambda m: "\\u%04x" % ord(m.group(1)), links) + ' };\n')
f.write("""
var fragid = window.location.hash.substr(1);
if (!fragid) { /* handle section-foo.html links from the old multipage version, and broken foo.html from the new version */
var m = window.location.pathname.match(/\/(?:section-)?([\w\-]+)\.html/);
if (m) fragid = m[1];
}
var page = fragment_links[fragid];
if (page) {
window.location.replace(page+'.html#'+fragid);
}
""")
print "Done."