# Source code for scrapple.selectors.selector
"""
scrapple.selectors.selector
~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""
from __future__ import print_function
import random
from scrapple.utils.text import make_ascii
import requests
from lxml import etree
from lxml.etree import XPathError
try:
from urlparse import urljoin
except ImportError:
from urllib.parse import urljoin
requests.warnings.filterwarnings('ignore')
class Selector(object):
    """
    This class defines the basic ``Selector`` object.

    Concrete subclasses (``XpathSelector``/``CssSelector``) must implement
    :meth:`get_tree_tag`, which resolves a selector expression against the
    parsed element tree (``self.tree``).
    """
    __selector_type__ = ''

    # str/unicode compatibility: ``unicode`` does not exist on Python 3,
    # so referencing it directly raises NameError there.
    try:
        _STRING_TYPES = (str, unicode)  # Python 2
    except NameError:
        _STRING_TYPES = (str,)  # Python 3

    def __init__(self, url):
        """
        The URL of the web page to be loaded is validated - ensuring the schema has \
        been specified, and that the URL is valid. A HTTP GET request is made to load \
        the web page, and the HTML content of this fetched web page is used to generate \
        the :ref:`element tree <concepts-structure>`. This is the element tree that will \
        be parsed to extract the necessary content.

        :param url: The URL of the web page to be fetched and parsed
        :raises Exception: If the URL is malformed, invalid, or unreachable
        """
        try:
            headers = {
                'content-encoding': 'gzip',
                'Accept-Encoding': 'identity, compress, gzip',
                'Accept': '*/*'
            }
            # Rotate through a pool of common desktop user agents to reduce
            # the chance of being served a bot-specific response.
            headers['User-Agent'] = random.choice([
                'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:34.0) Gecko/20100101 Firefox/34.0',
                'Mozilla/5.0 (Windows NT 6.1; rv:34.0) Gecko/20100101 Firefox/34.0',
                'Mozilla/5.0 (X11; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0',
                'Mozilla/5.0 (Windows NT 5.1; rv:34.0) Gecko/20100101 Firefox/34.0',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
                'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0',
                'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0',
                'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
                'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
                'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36',
                'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
            ])
            self.url = url
            self.content = requests.get(url, headers=headers).content
            self.tree = etree.HTML(self.content)
        except requests.exceptions.MissingSchema:
            # Fixed the previously unterminated quote in this message.
            raise Exception('URL should be of the form "http://<page_link>"')
        except requests.exceptions.InvalidURL:
            raise Exception('The URL provided is invalid')
        except requests.exceptions.ConnectionError:
            raise Exception('Ensure that you are connected to the Internet and that the page exists')

    def get_tree_tag(self, selector='', get_one=False, *args, **kwargs):
        """
        Resolve ``selector`` against the element tree.

        :param selector: The selector expression (XPath/CSS, per subclass)
        :param get_one: If True, return a single matched tag; otherwise a list
        :raises NotImplementedError: Always — subclasses must override this
        """
        raise NotImplementedError

    def extract_content(self, selector='', attr='', default='', connector='', *args, **kwargs):
        """
        Method for performing the content extraction for the particular selector type. \
        If the selector is "url", the URL of the current web page is returned.
        Otherwise, the selector expression is used to extract content. The particular \
        attribute to be extracted ("text", "href", etc.) is specified in the method \
        arguments, and this is used to extract the required content. If the content \
        extracted is a link (from an attr value of "href" or "src"), the URL is parsed \
        to convert the relative path into an absolute path.
        If the selector does not fetch any content, the default value is returned. \
        If no default value is specified, an exception is raised.

        :param selector: The XPath expression
        :param attr: The attribute to be extracted from the selected tag
        :param default: The default value to be used if the selector does not return any data
        :param connector: String connector for list of data returned for a particular selector
        :return: The extracted content
        :raises Exception: If the selector matches nothing and no default is given, \
        or if the selector expression itself is invalid
        """
        try:
            if selector.lower() == "url":
                return self.url
            # Single tag lookup is common to both branches; fetch it once.
            tag = self.get_tree_tag(selector=selector, get_one=True)
            if attr.lower() == "text":
                content = connector.join([make_ascii(x).strip() for x in tag.itertext()])
                content = content.replace("\n", " ").strip()
            else:
                content = tag.get(attr)
                if attr in ["href", "src"]:
                    # Convert relative links into absolute URLs.
                    content = urljoin(self.url, content)
            return content
        except IndexError:
            # ``is not ""`` was an identity check on a literal; equality is correct.
            if default != "":
                return default
            raise Exception("There is no content for the %s selector - %s" % (self.__selector_type__, selector))
        except XPathError:
            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))

    def extract_links(self, selector='', *args, **kwargs):
        """
        Method for performing the link extraction for the crawler. \
        The selector passed as the argument is a selector to point to the anchor tags \
        that the crawler should pass through. A list of links is obtained, and the links \
        are iterated through. The relative paths are converted into absolute paths and \
        a ``XpathSelector``/``CssSelector`` object (as is the case) is created with the URL of the next page as the argument \
        and this created object is yielded.
        The extract_links method basically generates ``XpathSelector``/``CssSelector`` objects for all of \
        the links to be crawled through.

        :param selector: The selector for the anchor tags to be crawled through
        :return: A ``XpathSelector``/``CssSelector`` object for every page to be crawled through
        :raises Exception: If the selector expression is invalid or link extraction fails
        """
        try:
            links = self.get_tree_tag(selector=selector)
            for link in links:
                next_url = urljoin(self.url, link.get('href'))
                # ``type(self)`` keeps the concrete subclass for the next page.
                yield type(self)(next_url)
        except Exception:
            # Both XPathError and any other failure previously raised the
            # same message; a single handler preserves that behavior.
            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))

    def extract_tabular(self, header='', prefix='', suffix='', table_type='', *args, **kwargs):
        """
        Method for performing the tabular data extraction.

        :param result: A dictionary containing the extracted data so far
        :param table_type: Can be "rows" or "columns". This determines the type of table to be extracted. \
        A row extraction is when there is a single row to be extracted and mapped to a set of headers. \
        A column extraction is when a set of rows have to be extracted, giving a list of header-value mappings.
        :param header: The headers to be used for the table. This can be a list of headers, or a selector that gives the list of headers
        :param prefix: A prefix to be added to each header
        :param suffix: A suffix to be added to each header
        :param selector: For row extraction, this is a selector that gives the row to be extracted. \
        For column extraction, this is a list of selectors for each column.
        :param attr: The attribute to be extracted from the selected tag
        :param default: The default value to be used if the selector does not return any data
        :param verbosity: The verbosity set as the argument for scrapple run
        :return: A 2-tuple containing the list of all the column headers extracted and the list of \
        dictionaries which contain (header, content) pairs
        :raises Exception: If the header selector is invalid/empty or table_type is unrecognized
        """
        # ``type(header) in [str, unicode]`` crashed on Python 3 (NameError);
        # isinstance against the compat tuple works on both versions.
        if isinstance(header, self._STRING_TYPES):
            try:
                header_list = self.get_tree_tag(header)
                table_headers = [prefix + h.text + suffix for h in header_list]
            except Exception:
                raise Exception("Invalid %s selector for table header - %s" % (self.__selector_type__, header))
        else:
            table_headers = [prefix + h + suffix for h in header]
        if len(table_headers) == 0:
            raise Exception("Invalid %s selector for table header - %s" % (self.__selector_type__, header))
        if table_type not in ["rows", "columns"]:
            raise Exception("Specify 'rows' or 'columns' in table_type")
        if table_type == "rows":
            result_list = self.extract_rows(table_headers=table_headers, *args, **kwargs)
        else:
            result_list = self.extract_columns(table_headers=table_headers, *args, **kwargs)
        return table_headers, result_list

    def extract_rows(self, result=None, selector='', table_headers=None, attr='', connector='', default='', verbosity=0, *args, **kwargs):
        """
        Row data extraction for extract_tabular.

        Extracts a single row of values matched by ``selector`` and maps them
        onto ``table_headers``, returning a one-element list containing the
        (header, content) mapping.

        :param result: A dictionary containing the extracted data so far; \
        it is updated in place with the extracted (header, content) pairs
        :param selector: The selector expression giving the row of values
        :param table_headers: The list of headers to map the values onto
        :param attr: The attribute to be extracted ("text", "href", etc.)
        :param connector: String connector used when joining text fragments
        :param default: Fill value used when there are fewer values than headers
        :param verbosity: The verbosity set as the argument for scrapple run
        :return: A list containing the populated result dictionary
        :raises Exception: If the selector is invalid or not a string
        """
        # Mutable default arguments ({} / []) are shared across calls;
        # construct fresh objects instead.
        if result is None:
            result = {}
        if table_headers is None:
            table_headers = []
        result_list = []
        try:
            values = self.get_tree_tag(selector)
            if len(table_headers) >= len(values):
                # ``izip_longest`` is Python 2 only; fall back to the
                # Python 3 name.
                try:
                    from itertools import izip_longest as zip_longest  # Python 2
                except ImportError:
                    from itertools import zip_longest  # Python 3
                pairs = zip_longest(table_headers, values, fillvalue=default)
            else:
                # Builtin zip truncates to the shorter sequence on both
                # Python 2 and 3 (izip was Python 2 only).
                pairs = zip(table_headers, values)
            for head, val in pairs:
                if verbosity > 1:
                    print("\nExtracting", head, "attribute", sep=' ', end='')
                if attr.lower() == "text":
                    try:
                        content = connector.join([make_ascii(x).strip() for x in val.itertext()])
                    except Exception:
                        content = default
                    content = content.replace("\n", " ").strip()
                else:
                    content = val.get(attr)
                    if attr in ["href", "src"]:
                        content = urljoin(self.url, content)
                result[head] = content
            result_list.append(result)
        except XPathError:
            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
        except TypeError:
            raise Exception("Selector expression string to be provided. Got " + selector)
        return result_list

    def extract_columns(self, result=None, selector='', table_headers=None, attr='', connector='', default='', verbosity=0, *args, **kwargs):
        """
        Column data extraction for extract_tabular.

        Each selector in ``selector`` yields one column of tags; rows are
        assembled index-by-index across the columns until the shortest
        column is exhausted.

        :param result: A dictionary containing the extracted data so far; \
        each output row starts as a copy of it
        :param selector: A selector expression, or a list of them, one per column
        :param table_headers: The list of headers, one per column selector
        :param attr: The attribute to be extracted ("text", "href", etc.)
        :param connector: String connector used when joining text fragments
        :param default: The default value used when text extraction fails
        :param verbosity: The verbosity set as the argument for scrapple run
        :return: A list of dictionaries, one per extracted row
        :raises Exception: If the selector is invalid or of the wrong type
        """
        # Avoid shared mutable default arguments.
        if result is None:
            result = {}
        if table_headers is None:
            table_headers = []
        result_list = []
        try:
            # ``unicode`` is Python 2 only; use the compat tuple.
            if isinstance(selector, self._STRING_TYPES):
                selectors = [selector]
            elif type(selector) == list:
                selectors = selector[:]
            else:
                raise Exception("Use a list of selector expressions for the various columns")
            from itertools import count
            columns = {}
            # ``sel`` avoids clobbering ``selector``, which the error
            # messages below still need. (izip was Python 2 only.)
            for head, sel in zip(table_headers, selectors):
                columns[head] = self.get_tree_tag(sel)
            try:
                for i in count(start=0):
                    r = result.copy()
                    for head in columns.keys():
                        if verbosity > 1:
                            print("\nExtracting", head, "attribute", sep=' ', end='')
                        col = columns[head][i]
                        if attr == "text":
                            try:
                                content = connector.join([make_ascii(x).strip() for x in col.itertext()])
                            except Exception:
                                content = default
                            content = content.replace("\n", " ").strip()
                        else:
                            content = col.get(attr)
                            if attr in ["href", "src"]:
                                content = urljoin(self.url, content)
                        r[head] = content
                    result_list.append(r)
            except IndexError:
                # Ran past the end of the shortest column; extraction done.
                pass
        except XPathError:
            raise Exception("Invalid %s selector - %s" % (self.__selector_type__, selector))
        except TypeError:
            raise Exception("Selector expression string to be provided. Got " + selector)
        return result_list