#!/usr/bin/python
# -*- coding: utf-8 -*-
import logging
from re import findall
import requests
from bs4 import BeautifulSoup
from scrapper_helpers.utils import flatten
from trojmiastopl.utils import get_content_for_url, get_url
# Module-level logger used by every function below; it is named after this
# file's path (``__file__``).
log = logging.getLogger(__file__)
# Import-time side effect: sets up basic logging configuration at DEBUG level.
logging.basicConfig(level=logging.DEBUG)
def get_page_count(markup):
    """ Reads total page number from trojmiasto.pl search page

    The pager element (CSS class ``navi-pages``) is looked up in the markup
    and the largest integer found in its text is returned.

    :param markup: trojmiasto.pl search page markup
    :type markup: str
    :return: Total page number
    :rtype: int
    :except: If no page number was found - there is just one page.
    """
    html_parser = BeautifulSoup(markup, "html.parser")
    pager = html_parser.find(class_="navi-pages")
    if pager is None:
        # ``find`` returns None when nothing matches; reading ``.text`` from it
        # would raise AttributeError instead of using the one-page fallback.
        log.warning("No 'navi-pages' element found, assuming a single page")
        return 1
    try:
        return max(map(int, findall(r'\d+', pager.text)))
    except ValueError as e:
        # ``max`` of an empty sequence: the pager contains no digits.
        log.warning(e)
        return 1
def get_page_count_for_filters(category, region=None, **filters):
    """ Reads total page number for given search filters

    Builds the search url with ``get_url``, downloads it with
    ``get_content_for_url`` and returns the largest integer found in the text
    of the pager element (CSS class ``navi-pages``).

    :param category: Search category
    :param region: Search region
    :param filters: See :meth category.get_category for reference
    :type category: str
    :type region: str
    :type filters: dict
    :return: Total page number
    :rtype: int
    :except: If no page number was found - there is just one page.
    """
    url = get_url(category, region, **filters)
    response = get_content_for_url(url)
    html_parser = BeautifulSoup(response.content, "html.parser")
    pager = html_parser.find(class_="navi-pages")
    if pager is None:
        # ``find`` returns None when nothing matches; reading ``.text`` from it
        # would raise AttributeError instead of using the one-page fallback.
        log.warning("No 'navi-pages' element found, assuming a single page")
        return 1
    try:
        return max(map(int, findall(r'\d+', pager.text)))
    except ValueError as e:
        # ``max`` of an empty sequence: the pager contains no digits.
        log.warning(e)
        return 1
def parse_offer_url(markup):
    """ Extracts the offer link from a piece of markup

    The ``href`` attribute of the first ``<a>`` element found in the markup
    is returned.

    :param markup: Markup containing the offer link
    :type markup: str
    :return: Url with offer
    :rtype: str
    """
    first_anchor = BeautifulSoup(markup, "html.parser").find('a')
    return first_anchor.attrs['href']
def parse_available_offers(markup):
    """ Collects all offer links on search page markup

    Every element with CSS class ``ogl-head`` is rendered back to a string
    and handed to :func:`parse_offer_url` to extract its link.

    :param markup: Search page markup
    :type markup: str
    :return: Links to offer on given search page
    :rtype: list
    """
    soup = BeautifulSoup(markup, "html.parser")
    links = []
    for offer_head in soup.find_all(class_='ogl-head'):
        if not offer_head:
            continue
        links.append(parse_offer_url(str(offer_head)))
    return links
def get_category(category, region=None, **filters):
    """ Parses available offer urls from given category from every page

    The first result page is downloaded once; it is used both to read the
    total page count and as the source of the first batch of offers. Every
    following page is requested by appending ``?strona=<n>`` to the search url.

    :param category: Search category
    :param region: Search region
    :param filters: Dictionary with additional filters. Following example dictionary contains every possible filter
        with examples of its values.

    :Example:

    input_dict = {
        "offer_type": "Mieszkanie",  # offer type. See :meth:`utils.decode_type` for reference
        "cena[]": (300, None),  # price (from, to). None if you don't want to pass one of arguments
        "kaucja[]": (100, 1000),  # deposit
        "cena_za_m2[]": (5, 100),  # price/surface
        "powierzchnia[]": (23, 300),  # surface
        "l_pokoi[]": (2, 5),  # desired number of rooms
        "pietro[]": (-1, 6),  # desired floor, enum: from 1 to 49 and -1 (ground floor)
        "l_pieter[]": (1, 10),  # desired total number of floors in building
        "rok_budowy[]": (2003, 2017),  # date of built
        "data_wprow": "1d"  # date of adding offer. Available: 1d - today, 3d - 3 days ago, 1w - one week ago,
                            # 3w - 3 weeks ago
    }

    :type category: str
    :type region: str
    :type filters: dict
    :return: List of all offers for given parameters
    :rtype: list
    """
    current_url = get_url(category, region, **filters)
    response = get_content_for_url(current_url)
    page_max = get_page_count(response.content)
    parsed_urls = []
    for page in range(page_max):
        if page == 0:
            # The first page was already downloaded above to read the page
            # count, so its response is reused instead of requesting it again.
            log.debug(current_url)
        else:
            url = current_url + "?strona={0}".format(page)
            log.debug(url)
            response = get_content_for_url(url)
        log.info("Loaded page {0} of offers".format(page + 1))
        # parse_available_offers always returns a list (possibly empty).
        parsed_urls.append(parse_available_offers(response.content))
    # One list per page was collected; merge them into a single flat list.
    parsed_urls = list(flatten(parsed_urls))
    log.info("Loaded {0} offers".format(str(len(parsed_urls))))
    return parsed_urls
def get_offers_for_page(category, region, page, **filters):
    """ Parses offers for one specific page of given category with filters.

    The page is requested by appending ``?strona=<page>`` to the url built by
    ``get_url``.

    :param category: Search category
    :param region: Search region
    :param page: Page number
    :param filters: See :meth category.get_category for reference
    :type category: str
    :type region: str
    :type page: int
    :type filters: dict
    :return: List of all offers for given page and parameters
    :rtype: list
    :raises requests.HTTPError: If the request for the page failed; the
        original exception is logged and re-raised unchanged.
    """
    try:
        url = get_url(category, region, **filters) + "?strona={0}".format(page)
        response = get_content_for_url(url)
    except requests.HTTPError as e:
        log.warning('Request failed. Error: {0}'.format(e))
        # Re-raise the caught exception itself so its message, response and
        # traceback are kept; raising a fresh HTTPError would discard them.
        raise
    log.info("Loaded page {0} of offers".format(page))
    offers = parse_available_offers(response.content)
    log.info("Loaded {0} offers".format(str(len(offers))))
    return offers