1  """External interface to the BeautifulSoup HTML parser. 
  2  """ 
  3   
  4  __all__ = ["fromstring", "parse", "convert_tree"] 
  5   
  6  import re 
  7  from lxml import etree, html 
  8   
  9  try: 
 10      from bs4 import ( 
 11          BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, 
 12          Declaration, CData, Doctype) 
 13      _DECLARATION_OR_DOCTYPE = (Declaration, Doctype) 
 14  except ImportError: 
 15      from BeautifulSoup import ( 
 16          BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, 
 17          Declaration, CData) 
 18      _DECLARATION_OR_DOCTYPE = Declaration 
 19   
 20   
 21 -def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs): 
  22      """Parse a string of HTML data into an Element tree using the 
 23      BeautifulSoup parser. 
 24   
 25      Returns the root ``<html>`` Element of the tree. 
 26   
 27      You can pass a different BeautifulSoup parser through the 
 28      `beautifulsoup` keyword, and a diffent Element factory function 
 29      through the `makeelement` keyword.  By default, the standard 
 30      ``BeautifulSoup`` class and the default factory of `lxml.html` are 
 31      used. 
 32      """ 
 33      return _parse(data, beautifulsoup, makeelement, **bsargs) 
  34   
 35   
 36 -def parse(file, beautifulsoup=None, makeelement=None, **bsargs): 
  37      """Parse a file into an ElemenTree using the BeautifulSoup parser. 
 38   
 39      You can pass a different BeautifulSoup parser through the 
 40      `beautifulsoup` keyword, and a diffent Element factory function 
 41      through the `makeelement` keyword.  By default, the standard 
 42      ``BeautifulSoup`` class and the default factory of `lxml.html` are 
 43      used. 
 44      """ 
 45      if not hasattr(file, 'read'): 
 46          file = open(file) 
 47      root = _parse(file, beautifulsoup, makeelement, **bsargs) 
 48      return etree.ElementTree(root) 
  49   
 52      """Convert a BeautifulSoup tree to a list of Element trees. 
 53   
 54      Returns a list instead of a single root Element to support 
 55      HTML-like soup with more than one root element. 
 56   
 57      You can pass a different Element factory through the `makeelement` 
 58      keyword. 
 59      """ 
 60      root = _convert_tree(beautiful_soup_tree, makeelement) 
 61      children = root.getchildren() 
 62      for child in children: 
 63          root.remove(child) 
 64      return children 
  65   
 66   
 67   
 68   
 69 -def _parse(source, beautifulsoup, makeelement, **bsargs): 
  70      if beautifulsoup is None: 
 71          beautifulsoup = BeautifulSoup 
 72      if hasattr(beautifulsoup, "HTML_ENTITIES"):   
 73          if 'convertEntities' not in bsargs: 
 74              bsargs['convertEntities'] = 'html' 
 75      if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"):   
 76          if 'features' not in bsargs: 
 77              bsargs['features'] = ['html.parser']   
 78      tree = beautifulsoup(source, **bsargs) 
 79      root = _convert_tree(tree, makeelement) 
 80       
 81      if len(root) == 1 and root[0].tag == "html": 
 82          return root[0] 
 83      root.tag = "html" 
 84      return root 
  85   
 86   
# Matcher for the text of an HTML doctype declaration (case-insensitive,
# anchored at the start of the string because the bound method is ``match``).
# It skips any leading whitespace, '<' and '!' characters, requires
# ``DOCTYPE`` followed by ``HTML``, accepts an optional ``PUBLIC`` keyword,
# and then captures up to two optional quoted strings.  Both groups keep
# their surrounding quote characters (either ' or "); a group is ``None``
# when the corresponding string is absent.
_parse_doctype_declaration = re.compile(
    r'(?:\s|[<!])*DOCTYPE\s*HTML'
    r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
    r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
    re.IGNORECASE).match
 95       
 97          self.name = 'html' 
 98          self.attrs = [] 
 99          self.contents = contents 
 100   
 103   
106      if makeelement is None: 
107          makeelement = html.html_parser.makeelement 
108   
109       
110       
111       
112       
113       
114       
115      first_element_idx = last_element_idx = None 
116      html_root = declaration = None 
117      for i, e in enumerate(beautiful_soup_tree): 
118          if isinstance(e, Tag): 
119              if first_element_idx is None: 
120                  first_element_idx = i 
121              last_element_idx = i 
122              if html_root is None and e.name and e.name.lower() == 'html': 
123                  html_root = e 
124          elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE): 
125              declaration = e 
126   
127       
128       
129       
130       
131       
132      pre_root = beautiful_soup_tree.contents[:first_element_idx] 
133      roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1] 
134      post_root = beautiful_soup_tree.contents[last_element_idx+1:] 
135   
136       
137      if html_root is not None: 
138           
139          i = roots.index(html_root) 
140          html_root.contents = roots[:i] + html_root.contents + roots[i+1:] 
141      else: 
142           
143          html_root = _PseudoTag(roots) 
144   
145      convert_node = _init_node_converters(makeelement) 
146   
147       
148      res_root = convert_node(html_root) 
149      prev = res_root 
150      for e in reversed(pre_root): 
151          converted = convert_node(e) 
152          if converted is not None: 
153              prev.addprevious(converted) 
154              prev = converted 
155   
156       
157      prev = res_root 
158      for e in post_root: 
159          converted = convert_node(e) 
160          if converted is not None: 
161              prev.addnext(converted) 
162              prev = converted 
163   
164      if declaration is not None: 
165          try: 
166               
167              doctype_string = declaration.output_ready() 
168          except AttributeError: 
169              doctype_string = declaration.string 
170   
171          match = _parse_doctype_declaration(doctype_string) 
172          if not match: 
173               
174               
175              pass 
176          else: 
177              external_id, sys_uri = match.groups() 
178              docinfo = res_root.getroottree().docinfo 
179               
180              docinfo.public_id = external_id and external_id[1:-1] 
181              docinfo.system_url = sys_uri and sys_uri[1:-1] 
182   
183      return res_root 
 184   
187      converters = {} 
188      ordered_node_types = [] 
189   
190      def converter(*types): 
191          def add(handler): 
192              for t in types: 
193                  converters[t] = handler 
194                  ordered_node_types.append(t) 
195              return handler 
 196          return add 
197   
198      def find_best_converter(node): 
199          for t in ordered_node_types: 
200              if isinstance(node, t): 
201                  return converters[t] 
202          return None 
203   
204      def convert_node(bs_node, parent=None): 
205           
206          try: 
207              handler = converters[type(bs_node)] 
208          except KeyError: 
209              handler = converters[type(bs_node)] = find_best_converter(bs_node) 
210          if handler is None: 
211              return None 
212          return handler(bs_node, parent) 
213   
214      def map_attrs(bs_attrs): 
215          if isinstance(bs_attrs, dict):   
216              attribs = {} 
217              for k, v in bs_attrs.items(): 
218                  if isinstance(v, list): 
219                      v = " ".join(v) 
220                  attribs[k] = unescape(v) 
221          else: 
222              attribs = dict((k, unescape(v)) for k, v in bs_attrs) 
223          return attribs 
224   
225      def append_text(parent, text): 
226          if len(parent) == 0: 
227              parent.text = (parent.text or '') + text 
228          else: 
229              parent[-1].tail = (parent[-1].tail or '') + text 
230   
231       
232   
233      @converter(Tag, _PseudoTag) 
234      def convert_tag(bs_node, parent): 
235          attrs = bs_node.attrs 
236          if parent is not None: 
237              attribs = map_attrs(attrs) if attrs else None 
238              res = etree.SubElement(parent, bs_node.name, attrib=attribs) 
239          else: 
240              attribs = map_attrs(attrs) if attrs else {} 
241              res = makeelement(bs_node.name, attrib=attribs) 
242   
243          for child in bs_node: 
244               
245              try: 
246                  handler = converters[type(child)] 
247              except KeyError: 
248                  pass 
249              else: 
250                  if handler is not None: 
251                      handler(child, res) 
252                  continue 
253              convert_node(child, res) 
254          return res 
255   
256      @converter(Comment) 
257      def convert_comment(bs_node, parent): 
258          res = etree.Comment(bs_node) 
259          if parent is not None: 
260              parent.append(res) 
261          return res 
262   
263      @converter(ProcessingInstruction) 
264      def convert_pi(bs_node, parent): 
265          if bs_node.endswith('?'): 
266               
267               
268              bs_node = bs_node[:-1] 
269          res = etree.ProcessingInstruction(*bs_node.split(' ', 1)) 
270          if parent is not None: 
271              parent.append(res) 
272          return res 
273   
274      @converter(NavigableString) 
275      def convert_text(bs_node, parent): 
276          if parent is not None: 
277              append_text(parent, unescape(bs_node)) 
278          return None 
279   
280      return convert_node 
281   
282   
283   
284   
285  try: 
286      from html.entities import name2codepoint   
287  except ImportError: 
288      from htmlentitydefs import name2codepoint 
289   
290   
# ``handle_entities(repl, text)`` substitutes every ``&name;`` reference in
# ``text``; group 1 of each match is the bare entity name.  The pattern is a
# raw string so that ``\w`` is not treated as an (invalid) string escape.
handle_entities = re.compile(r"&(\w+);").sub

# ``unichr`` is a Python 2 builtin only.  Fall back to ``chr`` where it is
# missing so that entity unescaping does not fail with a NameError there.
try:
    unichr
except NameError:
    unichr = chr
295      if not string: 
296          return '' 
297       
298      def unescape_entity(m): 
299          try: 
300              return unichr(name2codepoint[m.group(1)]) 
301          except KeyError: 
302              return m.group(0)   
 303      return handle_entities(unescape_entity, string) 
304