import sys
import re
from lxml import etree # requires lxml 2.0
from copy import deepcopy

# Single-argument print(...) is used here because it produces the same output
# whether it is executed as a Python 2 print statement or a Python 3 call.
print("HTML5 Spec Splitter")

# Command-line state. Recognised "--" switches set the flags below; every
# other argument is collected, in order, into file_args.
absolute_uris = False
use_html5lib_parser = False
use_html5lib_serialiser = False
file_args = []

for arg in sys.argv[1:]:
    if arg == '--absolute':
        absolute_uris = True
    elif arg == '--html5lib-parser':
        use_html5lib_parser = True
    elif arg == '--html5lib-serialiser':
        use_html5lib_serialiser = True
    else:
        file_args.append(arg)

# Exactly two non-option arguments are required. Anything else prints the
# usage text and stops with a non-zero status so that calling scripts can
# detect the failure.
if len(file_args) != 2:
    # Options are read from sys.argv[1:], so they go after the script name.
    print('Run like "python spec-splitter.py [options] index multipage"')
    print('(The directory "multipage" must already exist)')
    print('')
    print('Options:')
    print(' --absolute ............. convert relative URLs to absolute (e.g. for images)')
    print(' --html5lib-parser ...... use html5lib parser instead of lxml')
    print(' --html5lib-serialiser .. use html5lib serialiser instead of lxml')
    sys.exit(1)

# html5lib is only imported when one of the html5lib options was requested,
# so it stays an optional dependency.
if use_html5lib_parser or use_html5lib_serialiser:
    import html5lib
    import html5lib.serializer
    import html5lib.treewalkers

index_page = 'index'

# The document is split on all

# elements, plus the following specific elements
# (which were chosen to split any pages that were larger than about 100-200KB, and
# may need to be adjusted as the spec changes):
# NOTE(review): the comment above starts mid-sentence ("... split on all ___
# elements"); a word naming the element seems to have been lost from the
# text -- confirm against the upstream script.
split_exceptions = [
    'common-microsyntaxes', 'urls', 'fetching-resources', 'common-dom-interfaces', 'namespaces', # <-- infrastructure
    'elements', # <-- dom

    'scripting-1', 'sections', 'grouping-content', 'text-level-semantics', 'edits',
    'embedded-content-1', 'the-iframe-element', 'the-video-element', 'the-canvas-element', 'the-map-element', 'tabular-data',
    'forms', 'the-input-element', 'states-of-the-type-attribute', 'number-state', 'common-input-element-attributes', 'the-button-element', 'association-of-controls-and-forms',
    'interactive-elements', 'commands', 'common-idioms', 'selectors', # <-- semantics

    'origin-0', 'timers', 'offline', 'history', 'links', # <-- browsers
    'dnd', # <-- editing

    'workers', 'network', 'web-messaging', 'webstorage',

    'parsing', 'tokenization', 'tree-construction', 'the-end', 'named-character-references', # <-- syntax
]

print("Parsing...")

# Parse document
# The input file is closed explicitly once parsing has finished (or failed);
# try/finally is used instead of "with" so no particular Python 2 minor
# version is assumed.
spec_source = open(file_args[0])
try:
    if use_html5lib_parser:
        parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
        doc = parser.parse(spec_source, encoding='utf-8')
    else:
        parser = etree.HTMLParser(encoding='utf-8')
        doc = etree.parse(spec_source, parser)
finally:
    spec_source.close()

print("Splitting...")
doctitle = doc.find('.//title').text # Absolutise some references, so the spec can be hosted elsewhere if absolute_uris: for a in ('href', 'src'): for t in ('link', 'script', 'img'): for e in doc.findall('//%s[@%s]' % (t, a)): if e.get(a)[0] == '/': e.set(a, 'http://www.whatwg.org' + e.get(a)) else: e.set(a, 'http://www.whatwg.org/specs/web-apps/current-work/' + e.get(a)) # Extract the body from the source document original_body = doc.find('body') # Create an empty body, for the page content to be added into later default_body = etree.Element('body') if original_body.get('class'): default_body.set('class', original_body.get('class')) if original_body.get('onload'): default_body.set('onload', 'fixBrokenLink(); %s' % original_body.get('onload')) original_body.getparent().replace(original_body, default_body) # Extract the header, so we can reuse it in every page header = original_body.find('.//*[@class="head"]') # Make a stripped-down version of it short_header = deepcopy(header) del short_header[2:] # Extract the items in the TOC (remembering their nesting depth) def extract_toc_items(items, ol, depth): for li in ol.iterchildren(): for c in li.iterchildren(): if c.tag == 'a': assert c.get('href')[0] == '#' items.append( (depth, c.get('href')[1:], c) ) elif c.tag == 'ol': extract_toc_items(items, c, depth+1) toc_items = [] extract_toc_items(toc_items, original_body.find('.//ol[@class="toc"]'), 0) # Prepare the link-fixup script link_fixup_script = etree.XML('