# Source code for trojmiastopl.offer

#!/usr/bin/python
# -*- coding: utf-8 -*-

import datetime as dt
import logging
import re

import requests
from bs4 import BeautifulSoup

from trojmiastopl.utils import get_content_for_url

# ``__builtin__`` is the Python 2 name of the builtins module; when importing
# from it fails, ``unicode`` is replaced by a stand-in that returns its first
# argument unchanged and ignores any further positional arguments.
try:
    from __builtin__ import unicode
except ImportError:
    unicode = lambda x, *args: x

# Module-level logger; note that it is named after this file's path
# (``__file__``), not after the dotted module name.
log = logging.getLogger(__file__)


def get_title(offer_markup):
    """ Searches for offer title on offer page

    The title is the stripped text of the element whose id is "ogl-title".

    :param offer_markup: Class "title-wrap" from offer page markup
    :type offer_markup: str
    :return: Title of offer or None if there is no title
    :rtype: str, None
    """
    title_tag = BeautifulSoup(offer_markup, "html.parser").find(id="ogl-title")
    if title_tag is None:
        # No title element in the markup
        return None
    return title_tag.text.strip()


def get_img_url(offer_markup):
    """ Searches for images in offer markup

    Collects the "href" attribute of every element with class "fancybox".
    Elements of that class which carry no "href" attribute are skipped.

    :param offer_markup: Id "gallery" from offer page markup
    :type offer_markup: str
    :return: Images of offer in list (empty when nothing was found)
    :rtype: list
    """
    html_parser = BeautifulSoup(offer_markup, "html.parser")
    return [
        link["href"]
        for link in html_parser.find_all(class_="fancybox")
        if link.has_attr("href")
    ]


def _strip_nbsp(node):
    """ Returns the string form of an address node without no-break spaces """
    return str(node).replace("\xa0", "")


def parse_region(offer_markup):
    """ Parses region information

    Reads the children of the "dd" element inside the "address" element.
    The first child is the city. If the second child contains a link, the
    link text is the district and the street is expected at index 3,
    otherwise the street is expected at index 2. A street that is not
    present is simply left out of the address instead of raising IndexError.

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: Region of offer (keys: voivodeship, city, district, address)
    :rtype: dict
    """
    html_parser = BeautifulSoup(offer_markup, "html.parser")
    parsed_address = html_parser.find(class_="address").find(class_="dd").contents
    city = _strip_nbsp(parsed_address[0])
    output = {"voivodeship": "Pomorskie", "city": city, "district": None, "address": city}
    # Just city
    if len(parsed_address) == 1:
        return output

    address_parts = [city]
    district = BeautifulSoup(str(parsed_address[1]), "html.parser").find("a")
    if district is not None:
        output["district"] = district.text
        address_parts.append(district.text)
        # City, district, street
        street_index = 3
    else:
        # City, street
        street_index = 2
    # Only read the street when the node is really there
    if len(parsed_address) > street_index:
        address_parts.append(_strip_nbsp(parsed_address[street_index]))
    output["address"] = ", ".join(address_parts)
    return output


# First three (lower-cased) letters of polish month names mapped to month numbers
_MONTH_NUMBERS = {
    'sty': 1,
    'lut': 2,
    'mar': 3,
    'kwi': 4,
    'maj': 5,
    'cze': 6,
    'lip': 7,
    'sie': 8,
    'wrz': 9,
    'paź': 10,
    'lis': 11,
    'gru': 12,
}


def get_month_num_for_string(value):
    """ Map for polish month names

    Only the first three letters are compared, case-insensitively and after
    removing surrounding whitespace, so any form of the name that keeps that
    prefix is recognised.

    :param value: Month value
    :type value: str
    :return: Month number or None if the name is not recognised
    :rtype: int, None
    """
    return _MONTH_NUMBERS.get(value.strip().lower()[:3])


def parse_date_to_timestamp(date):
    """ Parses string date to unix timestamp

    The date is expected as "<day> <polish month name> <year>". Parts may be
    separated by any run of whitespace. The result is the number of seconds
    between 1970-01-01 00:00 and midnight of the given day, computed on naive
    datetimes (no time zone conversion is applied).

    :param date: Date
    :type date: str
    :return: Unix timestamp
    :rtype: int
    """
    # split() without an argument collapses repeated whitespace, so no empty
    # parts appear between the day, month and year
    date_parts = date.split()
    month = get_month_num_for_string(date_parts[1])
    year = int(date_parts[2])
    day = int(date_parts[0])
    date_added = dt.datetime(year=year, month=month, day=day)
    return int((date_added - dt.datetime(1970, 1, 1)).total_seconds())


def parse_dates_and_id(offer_markup):
    """ Searches for date of creating and date of last update of an offer. Additionally parses offer id number.

    Every "li" element is classified by a keyword found in its text and its
    value is read from the "span" inside it. All three keys are always present
    in the result; a value that was not found stays None.

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: Date added and date updated if found and offer id (id, added, updated)
    :rtype: dict
    """
    html_parser = BeautifulSoup(offer_markup, "html.parser")
    output = {"id": None, "added": None, "updated": None}
    for detail in html_parser.find_all("li"):
        if detail.span is None:
            # Nothing to read the value from
            continue
        label = detail.text
        value = detail.span.text
        if "numer" in label:
            output["id"] = value
        elif "wprowadzenia" in label:
            output["added"] = parse_date_to_timestamp(value)
        elif "aktualizacja" in label:
            output["updated"] = parse_date_to_timestamp(value)
    return output


def get_surface(offer_markup):
    """ Searches for surface in offer markup

    The value is taken from the node that precedes the parent of the first
    "sup" element. The "m2" unit and spaces are removed and a decimal comma
    is turned into a dot before the conversion to float.

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: Surface or None if there is no surface
    :rtype: float, None

    :except: When there is no offer surface, or the text found is not a number, it will return None
    """
    html_parser = BeautifulSoup(offer_markup, "html.parser")
    try:
        surface = html_parser.sup.parent.previous_sibling
        return float(surface.replace("m2", "").strip().replace(",", ".").replace(" ", ""))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: expected nodes are missing or not text,
        # ValueError: the text could not be converted to a number
        return None


def get_apartment_type(offer_markup):
    """ Searches for apartment type in offer markup

    Reads the stripped text of the "dd" element inside the element with class
    "rodzaj_nieruchomosci". A missing element yields None, as in the other
    getters of this module, instead of raising AttributeError.

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: Apartment type or None if there is no information
    :rtype: str, None
    """
    html_parser = BeautifulSoup(offer_markup, "html.parser")
    try:
        return html_parser.find(class_="rodzaj_nieruchomosci").find(class_="dd").text.strip()
    except AttributeError:
        return None


def get_available_from(offer_markup):
    """ Searches for available from in offer markup

    Reads the stripped text of the "dd" element inside the element with class
    "dostepne_od".

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: Available from or None if there is no information
    :rtype: str, None
    """
    section = BeautifulSoup(offer_markup, "html.parser").find(class_="dostepne_od")
    if section is None:
        return None
    value = section.find(class_="dd")
    if value is None:
        return None
    return value.text.strip()


def get_additional_information(offer_markup):
    """ Searches for additional info and heating type

    The features are detected by looking for polish keywords in the text that
    follows the first "Dodatkowe informacje" heading of the "description"
    element (up to the next such heading, if any). When the element or the
    heading is missing every feature is reported as False.

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: Additional info with optional heating type ('heating' is False when not given)
    :rtype: dict
    """
    description = BeautifulSoup(offer_markup, "html.parser").find(class_="description")
    additional_info = ""
    heating_type = False
    if description is not None:
        sections = description.text.split('Dodatkowe informacje')
        if len(sections) > 1:
            additional_info = sections[1].strip()
        heating = description.find('div', class_="typ_ogrzewania")
        heating_value = heating.find(class_="dd") if heating is not None else None
        if heating_value is not None:
            heating_type = heating_value.text.strip()

    # Result key -> keyword that has to appear in the additional information
    keywords = (
        ('balcony', 'balkon'),
        ('kitchen', 'kuchnia'),
        ('terrace', 'taras'),
        ('internet', 'internet'),
        ('elevator', 'winda'),
        ('car_parking', 'parkingowe'),
        ('disabled_facilities', 'podjazd'),
        ('mezzanine', 'antresola'),
        ('basement', 'piwnica'),
        ('duplex_apartment', 'dwupoziomowe'),
        ('garden', 'ogródek'),
        ('garage', 'garaż'),
        ('cable_tv', 'kablówka'),
    )
    output = {'heating': heating_type}
    for name, keyword in keywords:
        output[name] = keyword in additional_info
    return output


def parse_description(description_markup):
    """ Searches for offer description

    Takes the text content of the markup, drops everything from the first
    "$(function" onwards and normalises the whitespace.

    :param description_markup: Class "ogl-description" from offer page markup
    :type description_markup: str
    :return: Offer description
    :rtype: str
    """
    text = BeautifulSoup(description_markup, "html.parser").text
    # Keep only the part that comes before "$(function"
    text = text.split("$(function")[0]
    text = text.replace("  ", "").replace("\n", " ").replace("\r", "")
    # \xa0 means no-break space symbol
    return text.replace(u'\xa0', u' ').strip()


def get_furnished(offer_markup):
    """ Searches if offer is marked as furnished or not

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: True when the text of the "umeblowane" element contains "tak", False when it
        does not, None if there is no such element
    :rtype: bool, None
    """
    furniture = BeautifulSoup(offer_markup, "html.parser").find(class_="umeblowane")
    if furniture is None:
        # No information about furniture
        return None
    return "tak" in furniture.text


def parse_flat_data(offer_markup):
    """ Parses flat data from sidebar

    For every key the element with the class of the same name is looked up and
    all digits found in its text are joined into one integer. A text containing
    "parter" counts as 0. A missing element, or an element without any digit,
    leaves the value as None.

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: Information about price, deposit, floor, number of rooms, date of built and
        total count of floors in building
    :rtype: dict
    """
    html_parser = BeautifulSoup(offer_markup, "html.parser")
    flat_data = {"pietro": None, "l_pokoi": None, "rok_budowy": None, "l_pieter": None, "cena": None, "kaucja": None}
    for element in list(flat_data):
        current = html_parser.find(class_=element)
        if current is None:
            continue
        text = current.text
        if "parter" in text:
            flat_data[element] = 0
            continue
        digits = "".join(re.findall(r'\d+', text))
        # int("") would raise ValueError, so a text without digits is skipped
        if digits:
            flat_data[element] = int(digits)
    return flat_data


def parse_poster_name(contact_markup):
    """ Parses poster name

    :param contact_markup: Class "contact-box" from offer page markup
    :type contact_markup: str
    :return: Stripped text of the "name" element or None if there is no such element
    :rtype: str, None
    """
    poster_name = BeautifulSoup(contact_markup, "html.parser").find(class_="name")
    if poster_name is None:
        return None
    return poster_name.text.strip()


def _timestamp_to_iso(timestamp):
    """ Returns ISO 8601 form of a timestamp made by parse_date_to_timestamp, None stays None """
    if timestamp is None:
        return None
    # parse_date_to_timestamp counts seconds from a naive 1970-01-01, so the
    # same naive epoch is used here; datetime.fromtimestamp() would shift the
    # value by the local UTC offset
    return (dt.datetime(1970, 1, 1) + dt.timedelta(seconds=timestamp)).isoformat()


def parse_offer(url):
    """ Parses data from offer page url

    Downloads the page, cuts it into its sections (title, gallery, contact
    box, info box, description, sidebar) and passes each one to the matching
    parser of this module. Values that could not be found are None.

    :param url: Url of current offer page
    :type url: str
    :return: Dictionary with all offer details or None if the offer is not available
    :rtype: dict, None

    :except: If there is no offer title anymore - offer got deleted and None is returned.
        Raises requests.HTTPError when no response was received for the url.
    """
    log.debug(url)
    response = get_content_for_url(url)
    if response is None:
        raise requests.HTTPError
    html_parser = BeautifulSoup(response.content, "html.parser")
    title = get_title(str(html_parser.find(class_="title-wrap")))
    if title is None:
        log.warning("Offer {0} is not available anymore.".format(url))
        return None
    images = get_img_url(str(html_parser.find(id="gallery")))
    contact_content = str(html_parser.find(class_="contact-box"))
    dates_id = parse_dates_and_id(str(html_parser.find(class_="ogl-info-wrap")))
    description = parse_description(str(html_parser.find(class_="ogl-description")))
    offer_content = str(html_parser.find(id="sidebar"))
    surface = get_surface(offer_content)
    flat_data = parse_flat_data(offer_content)
    address = parse_region(offer_content)

    price = flat_data["cena"]
    # Both values are needed to compute the price of a square meter
    price_per_surface = round(price / surface) if surface and price is not None else None
    # .get() keeps a missing id or date from raising KeyError
    date_added = dates_id.get("added")
    date_updated = dates_id.get("updated")
    return {
        "title": title,
        "offer_id": dates_id.get("id"),
        "type": get_apartment_type(offer_content),
        "address": address["address"],
        "voivodeship": address["voivodeship"],
        "city": address["city"],
        "district": address["district"],
        "price": price,
        "currency": "PLN",
        "deposit": flat_data["kaucja"],
        "surface": surface,
        "price/surface": price_per_surface,
        "floor": flat_data["pietro"],
        "floor_count": flat_data["l_pieter"],
        "rooms": flat_data["l_pokoi"],
        "built_date": flat_data["rok_budowy"],
        "available_from": get_available_from(offer_content),
        "furniture": get_furnished(offer_content),
        "additional": get_additional_information(offer_content),
        "poster_name": parse_poster_name(contact_content),
        "date_added": date_added,
        "date_updated": date_updated,
        "date_added_readable": _timestamp_to_iso(date_added),
        "date_updated_readable": _timestamp_to_iso(date_updated),
        "url": url,
        "description": description,
        "images": images
    }