import _base class HeadingMatcher(_base.HeadingMatcher): """Smart span algorithm, based on an idea by Simon Pieters and Ben Millard Essentially, headings only apply as far down/across the table as there are no other headers with the same colspan/rowspan. This version also has support for the headers attribute and for the scope attribute""" def matchAll(self, table): """ The basic algorithm is: 1. For each cell in the table: 2. If the cell has a headers attribute which lists the id of one or more heading cells in the table, set those as the headers for the cell 3. Otherwise select the headers of the cell from the scope attribute of the headers 4: Return the cell -> headers mapping (dict) """ rv = {} self.table = table #Create a header -> cells mapping based on @scope or auto headers = {} for cell in table.iterCells(): if self.isHeading(cell): headers[cell] = self.associateCellsWithHeader(cell) #Invert the headers -> cells mapping to a cell -> headers mapping headers_dict = {} for k, v in headers.iteritems(): if v is None: continue for cell in v: if cell not in headers_dict: headers_dict[cell] = [k] else: headers_dict[cell].append(k) for cell in table.iterCells(): headers_attr_headers = self.headersAttrHeaders(cell) #If the cell has a headers attribute add those headers and no others if headers_attr_headers: rv[cell] = headers_attr_headers elif cell in headers_dict: rv[cell] = headers_dict[cell] else: rv[cell] = None return rv def isHeading(self, cell): """Is the current cell a heading. Here we assume all cells and no cells are headings""" return cell.isHeading def associateCellsWithHeader(self, header): """Return the cells associated with a header according to its scope; either via the smart span algorithm for scope in (auto, row, col) or by selecting all cells below/right of the header in the (row|col)groups it spans (scope in (rowgroup, colgroup)) """ scope = None if "scope" in header.element.attrib: scope = header.element.attrib["scope"].lower() if scope is None or scope not in ("row", "col", "rowgroup", "colgroup"): scope = "auto" cells = [] if scope == "auto": cells = self.getCellsFromAxes(header, ("row", "col")) elif scope == "row": cells = self.getCellsFromAxes(header, ("row",), skip_heading_only_axes=False) elif scope == "col": cells = self.getCellsFromAxes(header, ("col",), skip_heading_only_axes=False) elif scope == "rowgroup": groups = self.getHeaderGroups(header, "row") assert len(groups) == 1 cells = self.getCellsFromGroup(header, groups[0]) elif scope == "colgroup": groups = self.getHeaderGroups(header, "col") for group in groups: cells.extend([item for item in self.getCellsFromGroup(header, group) if item not in cells]) return cells def getCellsFromAxes(self, header, axes, skip_heading_only_axes=True): """ Get cells associated with a header using the smart span algorithm The algorthm is this: 1. cell_list be the list of cells with which header is associated 2. For each axis in axes: 3. let span be the number of rows spanned by header on axis 4. for each row or column spanned by header on axis: 5. If skip_heading_only_axes is set and all the cells on the current row/column are headings, go to step 4 for the next row/column 6. let data_found be false 7. let current_cell be the cell immediatley adjacent to the header on the current row/column 8. If current_cell is a heading: 9. If current_cell's span across the current axis is equal to span and data_cell_found is True then go to step XX 10. Otherwise, if current_cell's span across the current axis is greater than or equal to span add current_cell to cell_list 11. Otherwise current_cell is a data cell. Add current_cell to cell_list and set data_cell_found to be true 12. Return cell_list Notes: This does not associate a cell that overlaps with the header cell It is not clear that the handling of groups of headers in the middle of the table is sophisticated enough; however we deal with simple cases where the headers match those at the begginning of the axis """ cells = [] for axis in axes: if axis == "row": min_index = header.anchor[1] max_index = header.anchor[1] + header.rowspan else: min_index = header.anchor[0] max_index = header.anchor[0] + header.colspan span = axis + "span" for axis_index in xrange(min_index, max_index): heading_span = getattr(header, span) data_cell_found = False if axis == "row": start_index = (header.anchor[0]+header.colspan, axis_index) else: start_index = (axis_index, header.anchor[1]+header.rowspan) current_headings = [] #If all the cells in the row/col are headings, none apply to each other if skip_heading_only_axes: all_headings = True for cell in self.table.iterAxis(start_index, axis=axis, dir=1): all_headings = self.isHeading(cell) if not all_headings: break if all_headings: continue for cell in self.table.iterAxis(start_index, axis=axis, dir=1): if self.isHeading(cell): current_span = getattr(cell, span) if heading_span == current_span and data_cell_found: break elif heading_span >= current_span: cells.append(cell) elif not self.isHeading(cell): cells.append(cell) data_cell_found = True return cells def getCellsFromGroup(self, header, group): """Get all the matching cells for a heading that scopes a group Matching cells are those that lie below and to the right of the header in the group (assuming ltr)""" rv = [] for cell in group: if (cell.anchor[0] >= header.anchor[0] and cell.anchor[1] >= header.anchor[1] and cell != header): rv.append(cell) return rv def getHeaderGroups(self, cell, axis): """Get all (row|col)groups spanned by cell axis - row or col""" property_map = {"col":(0, "colgroups"), "row":(1, "rowgroups")} rv = [] idx, group_type = property_map[axis] for group in getattr(self.table, group_type): if (cell.anchor[idx] >= group.anchor[idx]): if cell.anchor[idx] < group.anchor[idx] + group.span: rv.append(group) else: if group.anchor[idx] < cell.anchor[idx] + getattr(cell, axis + "span"): rv.append(group) return rv