#!/usr/bin/python
# -*- coding: utf-8 -*-
import datetime as dt
import logging
import re
import requests
from bs4 import BeautifulSoup
from trojmiastopl.utils import get_content_for_url
# Python 2/3 compatibility: Python 2 exposes ``unicode`` through the
# ``__builtin__`` module. When that import fails, a pass-through stand-in
# with the same call shape is defined instead.
try:
    from __builtin__ import unicode
except ImportError:
    def unicode(x, *args):
        """ Returns ``x`` unchanged, ignoring any extra positional arguments """
        return x

# Module-level logger, named after this file's path
log = logging.getLogger(__file__)
def get_title(offer_markup):
    """ Extracts the offer title from the offer page

    :param offer_markup: Class "title-wrap" from offer page markup
    :type offer_markup: str
    :return: Stripped text of the element with id "ogl-title", or None when
        the markup contains no such element
    :rtype: str, None
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    title_element = soup.find(id="ogl-title")
    if title_element is None:
        # No title element in the markup
        return None
    return title_element.text.strip()
def get_img_url(offer_markup):
    """ Searches for images in offer markup

    :param offer_markup: Id "gallery" from offer page markup
    :type offer_markup: str
    :return: "href" values of all elements with class "fancybox", in document
        order; an empty list when there are none
    :rtype: list
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    # href=True restricts the match to elements that actually carry the
    # attribute, so a single malformed gallery entry cannot raise KeyError.
    return [link["href"] for link in soup.find_all(class_="fancybox", href=True)]
def parse_region(offer_markup):
    """ Parses region information

    The "dd" element inside the "address" element is read as a list of child
    nodes: the city first, optionally a node containing a district link at
    index 1, and the street at index 3 (with a district) or at index 2
    (without one).

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: Dictionary with keys "voivodeship", "city", "district" and
        "address"; parts that are not present in the markup are None and are
        left out of the joined "address" string
    :rtype: dict
    :raises AttributeError: When the markup has no "address" element or that
        element has no "dd" child
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    parts = soup.find(class_="address").find(class_="dd").contents

    def clean(node):
        # \xa0 is the no-break space symbol
        return str(node).replace("\xa0", "")

    output = {"voivodeship": "Pomorskie", "city": None, "district": None}
    if not parts:
        # Empty address element: nothing to parse, keep every key present
        output["address"] = None
        return output

    output["city"] = clean(parts[0])

    district_link = None
    if len(parts) > 1:
        # Re-parse the second node so that find() also matches when the node
        # itself is the <a> element (Tag.find only searches descendants).
        district_link = BeautifulSoup(str(parts[1]), "html.parser").find("a")
    if district_link is not None:
        output["district"] = district_link.text

    # The street follows the district at index 3, or the city at index 2.
    # Check the index actually exists instead of assuming the layout.
    street_index = 3 if district_link is not None else 2
    street = clean(parts[street_index]) if len(parts) > street_index else None

    pieces = (output["city"], output["district"], street)
    output["address"] = ", ".join(piece for piece in pieces if piece is not None)
    return output
def get_month_num_for_string(value):
    """ Translates a polish month name into its number

    Only the first three characters of the lower-cased name are compared, so
    any form sharing that prefix (e.g. "stycznia", "Sty") is accepted.

    :param value: Month name
    :type value: str
    :return: Month number (1-12), or None when the prefix is not recognised
    :rtype: int, None
    """
    prefixes = (
        'sty', 'lut', 'mar', 'kwi', 'maj', 'cze',
        'lip', 'sie', 'wrz', 'paź', 'lis', 'gru',
    )
    # Position in the tuple (counted from 1) is the month number
    numbers_by_prefix = {prefix: number for number, prefix in enumerate(prefixes, start=1)}
    key = value.lower()[:3]
    return numbers_by_prefix.get(key)
def parse_date_to_timestamp(date):
    """ Parses string date to unix timestamp

    :param date: Date written as "<day> <polish month name> <year>"
    :type date: str
    :return: Number of seconds between 1970-01-01 00:00 and midnight of the
        given date (no timezone conversion is applied)
    :rtype: int
    :raises ValueError: When the string has fewer than three parts, the month
        name is not recognised, or day/year are not integers
    """
    # split() without an argument treats any run of whitespace (including
    # no-break spaces) as one separator, so doubled spaces do not produce
    # empty parts.
    parts = date.split()
    if len(parts) < 3:
        raise ValueError("Unrecognized date format: {0!r}".format(date))
    day_text, month_text, year_text = parts[:3]

    month = get_month_num_for_string(month_text)
    if month is None:
        # Fail with a clear message instead of passing None to datetime()
        raise ValueError("Unrecognized month name: {0!r}".format(month_text))

    date_added = dt.datetime(year=int(year_text), month=month, day=int(day_text))
    return int((date_added - dt.datetime(1970, 1, 1)).total_seconds())
def parse_dates_and_id(offer_markup):
    """ Searches for date of creating and date of last update of an offer. Additionally parses offer id number.

    Every "li" element is inspected: its text decides which field it
    describes and its first "span" holds the value.

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: Dictionary with keys "id", "added" and "updated"; a key is None
        when the matching entry was not found
    :rtype: dict
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    # All keys are pre-filled so callers can index the result safely even
    # when an entry is missing from the markup.
    output = {"id": None, "added": None, "updated": None}
    for detail in soup.find_all("li"):
        value = detail.span
        if value is None:
            # List item without a value element: nothing to read
            continue
        label = detail.text
        if "numer" in label:
            output["id"] = value.text
        elif "wprowadzenia" in label:
            output["added"] = parse_date_to_timestamp(value.text)
        elif "aktualizacja" in label:
            output["updated"] = parse_date_to_timestamp(value.text)
    return output
def get_surface(offer_markup):
    """ Searches for surface in offer markup

    The value is read from the node that precedes the parent of the first
    "sup" element; "m2", spaces and a decimal comma are normalised before
    conversion.

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: Surface or None if there is no surface or it is not a number
    :rtype: float, None
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    try:
        raw = soup.sup.parent.previous_sibling
        return float(raw.replace("m2", "").strip().replace(",", ".").replace(" ", ""))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: no "sup" element or no preceding node.
        # TypeError / ValueError: the preceding node is not plain text or
        # does not hold a number. All of these mean "no usable surface".
        return None
def get_apartment_type(offer_markup):
    """ Reads the apartment type from offer markup

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: Stripped text of the "dd" element inside "rodzaj_nieruchomosci"
    :rtype: str
    :raises AttributeError: When either element is missing from the markup
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    type_row = soup.find(class_="rodzaj_nieruchomosci")
    type_value = type_row.find(class_="dd")
    return type_value.text.strip()
def get_available_from(offer_markup):
    """ Reads the "available from" information from offer markup

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: Stripped text of the "dd" element inside "dostepne_od", or None
        when either element is missing
    :rtype: str, None
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    row = soup.find(class_="dostepne_od")
    value = row.find(class_="dd") if row is not None else None
    if value is None:
        return None
    return value.text.strip()
def parse_description(description_markup):
    """ Searches for offer description

    :param description_markup: Class "ogl-description" from offer page markup
    :type description_markup: str
    :return: Offer description as a single line: text before the first
        "$(function" marker, with every run of whitespace collapsed to one
        space and no leading or trailing whitespace
    :rtype: str
    """
    text = BeautifulSoup(description_markup, "html.parser").text
    # Keep only what precedes "$(function" (presumably an inline script that
    # follows the description - verify against the page markup).
    body = text.split("$(function")[0]
    # split() breaks on any whitespace run (spaces, newlines, carriage
    # returns and the \xa0 no-break space). Re-joining with single spaces
    # keeps words separated instead of deleting the spaces between them.
    return " ".join(body.split())
def get_furnished(offer_markup):
    """ Checks whether the offer is marked as furnished

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: True when the "umeblowane" element's text contains "tak", False
        when it does not, None when the element is missing
    :rtype: bool, None
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    furniture_element = soup.find(class_="umeblowane")
    if furniture_element is None:
        # No information about furniture in the markup
        return None
    return "tak" in furniture_element.text
def parse_flat_data(offer_markup):
    """ Parses flat data from sidebar

    Each field is looked up by CSS class; all digits found in the element's
    text are concatenated into one integer. Text containing "parter" is
    stored as 0.

    :param offer_markup: Class "sidebar" from offer page markup
    :type offer_markup: str
    :return: Dictionary with keys "pietro", "l_pokoi", "rok_budowy",
        "l_pieter", "cena" and "kaucja"; a value is None when the element is
        missing or its text contains no digits
    :rtype: dict
    """
    soup = BeautifulSoup(offer_markup, "html.parser")
    field_classes = ("pietro", "l_pokoi", "rok_budowy", "l_pieter", "cena", "kaucja")
    flat_data = dict.fromkeys(field_classes)
    for css_class in field_classes:
        element = soup.find(class_=css_class)
        if element is None:
            continue
        text = element.text
        if "parter" in text:
            flat_data[css_class] = 0
            continue
        digits = "".join(re.findall(r'\d+', text))
        # A field without any digit stays None instead of raising
        # ValueError from int("").
        if digits:
            flat_data[css_class] = int(digits)
    return flat_data
def parse_poster_name(contact_markup):
    """ Reads the name of the person who posted the offer

    :param contact_markup: Class "contact-box" from offer page markup
    :type contact_markup: str
    :return: Stripped text of the element with class "name", or None when
        there is no such element
    :rtype: str, None
    """
    soup = BeautifulSoup(contact_markup, "html.parser")
    name_element = soup.find(class_="name")
    if name_element is None:
        return None
    return name_element.text.strip()
def _timestamp_to_iso(timestamp):
    """ Formats seconds counted from 1970-01-01 00:00 as an ISO 8601 string; None stays None """
    if timestamp is None:
        return None
    # Exact inverse of the epoch arithmetic in parse_date_to_timestamp. Adding
    # to the same naive epoch avoids the local-timezone shift that
    # datetime.fromtimestamp() would apply.
    return (dt.datetime(1970, 1, 1) + dt.timedelta(seconds=timestamp)).isoformat()


def parse_offer(url):
    """ Parses data from offer page url

    :param url: Url of current offer page
    :type url: str
    :return: Dictionary with all offer details, or None when the page has no
        offer title anymore (offer got deleted)
    :rtype: dict, None
    :raises requests.HTTPError: When get_content_for_url returns None
    """
    log.debug(url)
    response = get_content_for_url(url)
    if response is None:
        raise requests.HTTPError
    html_parser = BeautifulSoup(response.content, "html.parser")

    title = get_title(str(html_parser.find(class_="title-wrap")))
    if title is None:
        log.warning("Offer {0} is not available anymore.".format(url))
        return None

    images = get_img_url(str(html_parser.find(id="gallery")))
    contact_content = str(html_parser.find(class_="contact-box"))
    dates_id = parse_dates_and_id(str(html_parser.find(class_="ogl-info-wrap")))
    description = parse_description(str(html_parser.find(class_="ogl-description")))

    sidebar_content = str(html_parser.find(id="sidebar"))
    surface = get_surface(sidebar_content)
    flat_data = parse_flat_data(sidebar_content)
    address = parse_region(sidebar_content)

    price = flat_data["cena"]
    # Both values are optional: only divide when a price was found and the
    # surface is a non-zero number.
    price_per_surface = round(price / surface) if price is not None and surface else None

    # .get() keeps the offer parseable when the info box lacks the id or the
    # creation date.
    date_added = dates_id.get("added")
    date_updated = dates_id.get("updated")

    return {
        "title": title,
        "offer_id": dates_id.get("id"),
        "type": get_apartment_type(sidebar_content),
        "address": address["address"],
        "voivodeship": address["voivodeship"],
        "city": address["city"],
        "district": address["district"],
        "price": price,
        "currency": "PLN",
        "deposit": flat_data["kaucja"],
        "surface": surface,
        "price/surface": price_per_surface,
        "floor": flat_data["pietro"],
        "floor_count": flat_data["l_pieter"],
        "rooms": flat_data["l_pokoi"],
        "built_date": flat_data["rok_budowy"],
        "available_from": get_available_from(sidebar_content),
        "furniture": get_furnished(sidebar_content),
        # NOTE(review): get_additional_information is not defined or imported
        # in the visible part of this module - confirm it exists.
        "additional": get_additional_information(sidebar_content),
        "poster_name": parse_poster_name(contact_content),
        "date_added": date_added,
        "date_updated": date_updated,
        "date_added_readable": _timestamp_to_iso(date_added),
        "date_updated_readable": _timestamp_to_iso(date_updated),
        "url": url,
        "description": description,
        "images": images,
    }