# Source code for scrapple.selectors.selector

"""
scrapple.selectors.selector
~~~~~~~~~~~~~~~~~~~~~~~~~~~

"""

from __future__ import print_function

import random

from scrapple.utils.text import make_ascii

import requests
from lxml import etree
from lxml.etree import XPathError

try:
	from urlparse import urljoin
except ImportError:
	from urllib.parse import urljoin


# Silence warnings emitted via the `warnings` module as re-exposed by
# requests (presumably urllib3/SSL noise — TODO confirm intended scope).
requests.warnings.filterwarnings('ignore')


class Selector(object):
	"""
	This class defines the basic ``Selector`` object.

	"""

	# Overridden by subclasses with the selector flavour name (e.g. "XPath"),
	# used only for composing error messages.
	__selector_type__ = ''

	def __init__(self, url):
		"""
		The URL of the web page to be loaded is validated - ensuring the schema has \
		been specified, and that the URL is valid. A HTTP GET request is made to load \
		the web page, and the HTML content of this fetched web page is used to generate \
		the :ref:`element tree <concepts-structure>`. This is the element tree that will \
		be parsed to extract the necessary content.

		:param url: The URL of the web page to be loaded
		:raises Exception: If the URL is malformed, invalid, or unreachable
		"""
		try:
			headers = {
				'content-encoding': 'gzip',
				'Accept-Encoding': 'identity, compress, gzip',
				'Accept': '*/*'
			}
			# Rotate through a pool of common browser User-Agents to reduce
			# the chance of naive bot-blocking on the target site.
			headers['User-Agent'] = random.choice([
				'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0',
				'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:34.0) Gecko/20100101 Firefox/34.0',
				'Mozilla/5.0 (Windows NT 6.1; rv:34.0) Gecko/20100101 Firefox/34.0',
				'Mozilla/5.0 (X11; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0',
				'Mozilla/5.0 (Windows NT 5.1; rv:34.0) Gecko/20100101 Firefox/34.0',
				'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
				'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
				'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0',
				'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0',
				'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
				'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
				'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36',
				'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36',
				'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A',
				'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
			])
			self.url = url
			self.content = requests.get(url, headers=headers).content
			# lxml builds the element tree that the selector methods traverse
			self.tree = etree.HTML(self.content)
		except requests.exceptions.MissingSchema:
			# Fixed: the original message was missing the closing double quote
			raise Exception('URL should be of the form "http://<page_link>"')
		except requests.exceptions.InvalidURL:
			raise Exception('The URL provided is invalid')
		except requests.exceptions.ConnectionError:
			raise Exception('Ensure that you are connected to the Internet and that the page exists')

	def get_tree_tag(self, selector='', get_one=False, *args, **kwargs):
		"""
		Abstract hook - subclasses return the tag(s) of ``self.tree`` matching
		*selector*; with ``get_one=True`` a single tag is returned.
		"""
		raise NotImplementedError
[docs] def extract_content(self, selector='', attr='', default='', connector='', *args, **kwargs): """ Method for performing the content extraction for the particular selector type. \ If the selector is "url", the URL of the current web page is returned. Otherwise, the selector expression is used to extract content. The particular \ attribute to be extracted ("text", "href", etc.) is specified in the method \ arguments, and this is used to extract the required content. If the content \ extracted is a link (from an attr value of "href" or "src"), the URL is parsed \ to convert the relative path into an absolute path. If the selector does not fetch any content, the default value is returned. \ If no default value is specified, an exception is raised. :param selector: The XPath expression :param attr: The attribute to be extracted from the selected tag :param default: The default value to be used if the selector does not return any data :param connector: String connector for list of data returned for a particular selector :return: The extracted content """ try: if selector.lower() == "url": return self.url if attr.lower() == "text": tag = self.get_tree_tag(selector=selector, get_one=True) content = connector.join([make_ascii(x).strip() for x in tag.itertext()]) content = content.replace("\n", " ").strip() else: tag = self.get_tree_tag(selector=selector, get_one=True) content = tag.get(attr) if attr in ["href", "src"]: content = urljoin(self.url, content) return content except IndexError: if default is not "": return default raise Exception("There is no content for the %s selector - %s" % (self.__selector_type__, selector)) except XPathError: raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
[docs] def extract_tabular(self, header='', prefix='', suffix='', table_type='', *args, **kwargs): """ Method for performing the tabular data extraction. \ :param result: A dictionary containing the extracted data so far :param table_type: Can be "rows" or "columns". This determines the type of table to be extracted. \ A row extraction is when there is a single row to be extracted and mapped to a set of headers. \ A column extraction is when a set of rows have to be extracted, giving a list of header-value mappings. :param header: The headers to be used for the table. This can be a list of headers, or a selector that gives the list of headers :param prefix: A prefix to be added to each header :param suffix: A suffix to be added to each header :param selector: For row extraction, this is a selector that gives the row to be extracted. \ For column extraction, this is a list of selectors for each column. :param attr: The attribute to be extracted from the selected tag :param default: The default value to be used if the selector does not return any data :param verbosity: The verbosity set as the argument for scrapple run :return: A 2-tuple containing the list of all the column headers extracted and the list of \ dictionaries which contain (header, content) pairs """ if type(header) in [str, unicode]: try: header_list = self.get_tree_tag(header) table_headers = [prefix + h.text + suffix for h in header_list] except XPathError: raise Exception("Invalid %s selector for table header - %s" % (self.__selector_type__, header)) except Exception: raise Exception("Invalid %s selector for table header - %s" % (self.__selector_type__, header)) else: table_headers = [prefix + h + suffix for h in header] if len(table_headers) == 0: raise Exception("Invalid %s selector for table header - %s" % (self.__selector_type__, header)) if table_type not in ["rows", "columns"]: raise Exception("Specify 'rows' or 'columns' in table_type") if table_type == "rows": result_list = 
self.extract_rows(table_headers=table_headers, *args, **kwargs) else: result_list = self.extract_columns(table_headers=table_headers, *args, **kwargs) return table_headers, result_list
[docs] def extract_rows(self, result={}, selector='', table_headers=[], attr='', connector='', default='', verbosity=0, *args, **kwargs): """ Row data extraction for extract_tabular """ result_list = [] try: values = self.get_tree_tag(selector) if len(table_headers) >= len(values): from itertools import izip_longest pairs = izip_longest(table_headers, values, fillvalue=default) else: from itertools import izip pairs = izip(table_headers, values) for head, val in pairs: if verbosity > 1: print("\nExtracting", head, "attribute", sep=' ', end='') if attr.lower() == "text": try: content = connector.join([make_ascii(x).strip() for x in val.itertext()]) except Exception: content = default content = content.replace("\n", " ").strip() else: content = val.get(attr) if attr in ["href", "src"]: content = urljoin(self.url, content) result[head] = content result_list.append(result) except XPathError: raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector)) except TypeError: raise Exception("Selector expression string to be provided. Got " + selector) return result_list
[docs] def extract_columns(self, result={}, selector='', table_headers=[], attr='', connector='', default='', verbosity=0, *args, **kwargs): """ Column data extraction for extract_tabular """ result_list = [] try: if type(selector) in [str, unicode]: selectors = [selector] elif type(selector) == list: selectors = selector[:] else: raise Exception("Use a list of selector expressions for the various columns") from itertools import izip, count pairs = izip(table_headers, selectors) columns = {} for head, selector in pairs: columns[head] = self.get_tree_tag(selector) try: for i in count(start=0): r = result.copy() for head in columns.keys(): if verbosity > 1: print("\nExtracting", head, "attribute", sep=' ', end='') col = columns[head][i] if attr == "text": try: content = connector.join([make_ascii(x).strip() for x in col.itertext()]) except Exception: content = default content = content.replace("\n", " ").strip() else: content = col.get(attr) if attr in ["href", "src"]: content = urljoin(self.url, content) r[head] = content result_list.append(r) except IndexError: pass except XPathError: raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector)) except TypeError: raise Exception("Selector expression string to be provided. Got " + selector) return result_list