HDB resale prices

import os
import sys
import time
import datetime
import re

from scrapy.http import HtmlResponse

# Stand-alone check of the row-parsing logic against a saved HTML page,
# without running a full Scrapy crawl.

# Enter file path
filename = r'targeted file location'

# Read the page as raw bytes so that Scrapy works out the encoding itself
# (a text-mode read hands HtmlResponse a str body on Python 3, which it
# rejects unless an encoding is supplied).
with open(filename, 'rb') as f:
    html = f.read()

# Key line to allow Scrapy to parse the page
response = HtmlResponse(url="my HTML string", body=html)

# The first 10 <tr> elements are skipped -- presumably layout/header rows;
# confirm against the saved page.
for sel in response.xpath("//tr")[10:]:
    cells = sel.xpath('td/text()')  # text nodes of the row's cells

    item = dict()
    item['id'] = cells[0].extract()
    item['block_add'] = sel.xpath('td/a/span/text()')[0].extract()
    individual_block_link = sel.xpath('td/a/@href')[0].extract()
    item['individual_block_link'] = response.urljoin(individual_block_link)
    item['date'] = cells[3].extract()

    # Price is shown with thousands separators, e.g. "450,000".
    price = int(cells[4].extract().replace(',', ''))
    item['price'] = price
    # Floor division keeps price_k a whole number of thousands on both
    # Python 2 and Python 3.
    item['price_k'] = price // 1000

    item['size'] = cells[5].extract()
    item['psf'] = cells[6].extract()
    #agent = sel.xpath('td/a/span/text()')[1].extract()
    item['org_url_str'] = response.url

    # Dump every field of the row just parsed.
    for k, v in item.items():
        print('%s %s' % (k, v))

def parse(self, response):
    """Yield one item per transaction row, then follow the next-page link.

    Args:
        response: page response handed over by Scrapy; only its ``xpath``,
            ``urljoin`` and ``url`` members are used here.

    Yields:
        ScrapePropertyguruItem: one per ``<tr>`` element from the 11th
            onwards.
        scrapy.Request: request for the next page, handled by this same
            method, when a next-page link is found.

    Raises:
        IndexError: if a row has fewer cells/links than expected.
        ValueError: if the price cell is not a comma-separated integer.
    """
    # The first 10 <tr> elements are skipped -- presumably layout/header
    # rows; confirm against the live page.
    for sel in response.xpath("//tr")[10:]:
        cells = sel.xpath('td/text()')  # text nodes of the row's cells

        item = ScrapePropertyguruItem()
        item['id'] = cells[0].extract()
        item['block_add'] = sel.xpath('td/a/span/text()')[0].extract()
        individual_block_link = sel.xpath('td/a/@href')[0].extract()
        item['individual_block_link'] = response.urljoin(individual_block_link)
        item['date'] = cells[3].extract()

        # Price is shown with thousands separators, e.g. "450,000".
        price = int(cells[4].extract().replace(',', ''))
        item['price'] = price
        # Floor division keeps price_k a whole number of thousands on both
        # Python 2 and Python 3.
        item['price_k'] = price // 1000

        item['size'] = cells[5].extract()
        item['psf'] = cells[6].extract()
        #agent = sel.xpath('td/a/span/text()')[1].extract()
        item['org_url_str'] = response.url
        yield item

    # get next page link
    next_page = response.xpath("//div/div[6]/div/a[10]/@href")
    if next_page:
        page_url = response.urljoin(next_page[0].extract())
        yield scrapy.Request(page_url, self.parse)

This post is written for the Singapore context: the aim is to retrieve the Housing Development Board (HDB) resale prices for the year 2015, grouped by different parts of Singapore. All the price information is retrieved from the main HDB website. The website returns the past one year of records for each block or postcode. Hence, in order to retrieve all the records, one first needs to retrieve all the postcodes in Singapore. The list below outlines the information required to form the full picture.

Retrieve the full list of postcodes from the following SG postcode database.
The above database only has the postcodes; the next step is to match each postcode to its actual address. The same website also provides a postcode search that returns the corresponding address. You can automate this using the same process with Python, Python Pattern and pandas.
Retrieve the HDB resale prices by iterating over all the postcodes retrieved above.
An optional step is to retrieve the geocodes corresponding to each postcode so that all the data can be put onto a map. The post “Retrieving Geocodes from ZipCodes using Python and Selenium” describes the retrieval method.

The first code snippet applies to item 1, i.e., retrieving the postcodes. Item 2 is a two-step process: first search for the postcode to get the link, then retrieve the address from that link.


import pandas as pd
from pattern.web import  URL, extension

def retrieve_postal_code_fr_web_1(target_url, savefilelocation):
    """Download a page and save the first HTML table on it as a csv file.

    The csv file is named after the text that follows the last '=' in
    target_url, with '.csv' appended.

    Args:
        target_url (str): url of the page to download.
        savefilelocation (str): folder the csv file is written to.
    """
    # Everything after the final '=' of the url becomes the file stem.
    csv_name = target_url.rsplit('=', 1)[-1] + '.csv'
    csv_path = os.path.join(savefilelocation, csv_name)

    page_html = URL(target_url).download()

    # read_html gives one DataFrame per table found; only the first is kept.
    first_table = pd.read_html(page_html)[0]
    first_table.to_csv(csv_path, index=False)

The next snippet describes the method to retrieve the HDB resale prices. Exploring the HDB website shows that the dataset is in XML format. The URL is as follows: http://services2.hdb.gov.sg/webapp/BB33RTIS/BB33SResaleTransMap?postal=<postcode>. For easy retrieval of data in XML format, one way is to convert the XML to a dict and then convert the dict to a pandas DataFrame object. The Python module xmltodict serves this function.


import re, os, sys, datetime, time
import pandas as pd
import pattern
import xmltodict

from pattern.web import  URL, extension

class HDBResalesQuery(object):
    """Retrieve resale transaction records from the HDB web service.

    One query is made per postal code: the postal code is appended to
    ``com_data_start_url``, the XML reply is parsed with ``xmltodict`` and
    the records are stored as pandas DataFrames in ``single_postal_df``
    (the postal code queried last) and ``multi_postal_df`` (every postal
    code in ``postal_list``).
    """

    def __init__(self):
        """Initialise the url parameters, the result storage and the debug flag."""
        # url parameters -- for url formation
        self.com_data_start_url = 'http://services2.hdb.gov.sg/webapp/BB33RTIS/BB33SResaleTransMap?postal='
        self.postal_portion_url = ''
        self.com_data_full_url = ''
        self.postal_list = []  # multiple postal code list

        # storage
        self.single_postal_df = pd.DataFrame()
        self.multi_postal_df = pd.DataFrame()

        # debugging: a truthy value prints the url, the raw reply and the
        # resulting DataFrame of every query
        self.en_print = 1

    def set_postal_code(self, postalcode):
        """Set the postal code part of the url (``self.postal_portion_url``).

        Args:
            postalcode (str or int): postal code; it is converted with
                ``str``, so an int loses any leading zero.
        """
        self.postal_portion_url = str(postalcode)

    def set_postal_code_list(self, postalcodelist):
        """Set the postal codes to be processed (``self.postal_list``).

        Args:
            postalcodelist (list): list of postal codes.
        """
        self.postal_list = postalcodelist

    def form_url_str(self):
        """Form the full query url and store it in ``self.com_data_full_url``."""
        self.com_data_full_url = self.com_data_start_url + self.postal_portion_url

    def get_com_data(self):
        """Form the url for the current postal code and download its contents.

        Returns:
            The downloaded contents of ``self.com_data_full_url``.
        """
        self.form_url_str()
        if self.en_print:
            print(self.com_data_full_url)
        return URL(self.com_data_full_url).download()

    @staticmethod
    def _extract_records(obj):
        """Return the transaction records of a parsed reply as a list of dicts.

        Args:
            obj (dict): result of ``xmltodict.parse`` on a reply whose root
                element is ``Datasets``.

        Returns:
            list: one dict per ``Dataset`` element; empty when there is none.
        """
        datasets = obj['Datasets']
        # xmltodict maps an element without content (<Datasets/>) to None.
        if datasets is None or 'Dataset' not in datasets:
            return []

        data_set = datasets['Dataset']
        # A single <Dataset> child is parsed to one mapping, repeated
        # children to a list of mappings.
        if not isinstance(data_set, list):
            data_set = [data_set]
        return [dict(single_data) for single_data in data_set]

    def process_single_postal_code(self):
        """Query the current postal code and store the records found.

        The result is stored in ``self.single_postal_df``; it is an empty
        DataFrame when the reply holds no record.
        """
        contents = self.get_com_data()
        if self.en_print:
            print(contents)
        obj = xmltodict.parse(contents)

        self.single_postal_df = pd.DataFrame(self._extract_records(obj))
        if self.en_print:
            print(self.single_postal_df)

    def process_mutli_postal_code(self):
        """Query every postal code in ``self.postal_list``.

        The records of all postal codes are stacked, each keeping its own
        index, into ``self.multi_postal_df``. Postal codes without records
        are skipped. If a query raises, the records collected up to that
        point are still stored before the exception propagates.
        """
        self.multi_postal_df = pd.DataFrame()
        frames = []
        try:
            for postalcode in self.postal_list:
                if self.en_print:
                    print('processing postalcode:  %s' % (postalcode,))
                self.set_postal_code(postalcode)
                self.process_single_postal_code()
                if len(self.single_postal_df) > 0:  # skip postal codes with no data
                    frames.append(self.single_postal_df)
        finally:
            # Concatenate once instead of growing the frame per postal code.
            if frames:
                self.multi_postal_df = pd.concat(frames)

    # Correctly spelled alias; the original (misspelled) name is kept for
    # existing callers.
    process_multi_postal_code = process_mutli_postal_code

            

if __name__ == '__main__':
    # Trying out the class: query two postal codes and show the combined
    # records.
    postallist = ['640525', '180262']
    w = HDBResalesQuery()
    w.set_postal_code_list(postallist)
    w.process_mutli_postal_code()
    print(w.multi_postal_df)

Note that the whole process requires a large number of queries (110k) to the website. It is best to schedule the retrieval in batches, or the website will shut you out (identifying you as a bot).

The following is the Tableau representation of all the data. It is still a preliminary version.