Month: January 2015

Get RSS feeds using Python Pattern

The Python Pattern module provides an easy way to retrieve RSS feeds. The following script acts as a feed reader, retrieving feeds from various sites; this example focuses on world news and Singapore stock market news.

The pattern.web module has a Newsfeed object that takes in an RSS url and outputs the corresponding results. The following is the description of the Newsfeed object from the Pattern website: “The Newsfeed object is a wrapper for Mark Pilgrim’s Universal Feed Parser. Newsfeed.search() takes the URL of an RSS or Atom news feed and returns a list of Result objects.”

Each Result object has attributes such as title, link, text and date. The script below takes in a dict whose keys are the different categories and whose values are lists of RSS urls belonging to each category. It outputs a dict of categories, with the results of each category segregated by date key. This allows consolidation of feeds from various RSS sources so that the user can further process them. The printing of the feeds can be limited by set_last_desired_date(), which displays only results from a certain date onwards.
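As a quick illustration of the Newsfeed interface on its own, the minimal sketch below pulls a few results from one of the feed urls used in the script (the attributes follow the Pattern documentation; the date string shown is an assumption based on the usual RSS date format):

from pattern.web import Newsfeed, plaintext

feed_url = 'http://feeds.reuters.com/reuters/businessNews'
for result in Newsfeed().search(feed_url)[:3]:
    print result.title               # headline
    print result.date                # e.g. 'Tue, 27 Jan 2015 07:00:10 +0000'
    print plaintext(result.text)     # description with any HTML tags stripped
    print '-' * 10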

import os, re, sys, time, datetime, copy, calendar
from pattern.web import URL, extension, cache, plaintext, Newsfeed

class FeedsReader(object):
    def __init__(self):

        ## For grouping the feeds into the various categories
        self.rss_sites_by_category_dict = {
                                            'SG':   [
                                                        'http://feeds.theedgemarkets.com/theedgemarkets/sgtopstories.rss',
                                                        'http://feeds.theedgemarkets.com/theedgemarkets/sgmarkets.rss',
                                                        'http://feeds.theedgemarkets.com/theedgemarkets/sgproperty.rss',
                                                      ],
                                            'World':[
                                                        'http://www.ft.com/rss/home/asia',
                                                        'http://rss.cnn.com/rss/money_news_economy.rss',
                                                        'http://feeds.reuters.com/reuters/businessNews',
                                                      ],
                                            }
        self.rss_sites = []

        ## number of feeds to parse per site
        self.num_feeds_parse_per_site = 100

        ## individual group storage of feeds.
        self.rss_results_dict = {} # dict with date as key
        self.rss_title_list = []

        ## full results set organized by category
        self.rss_results_dict_by_cat = {} # dict of dict
        self.rss_title_list_by_cat = {}   # dict of list

    def set_rss_sites(self, rss_site_urls):
        """ Set to self.rss_sites.
            Args:
                rss_site_urls (list): list of rss site url for getting feeds.
        """
        self.rss_sites = rss_site_urls

    def convert_date_str_to_date_key(self, date_str):
        """ Convert the date str given by twiiter [created_at] to date key in format YYYY-MM-DD.
            Args:
                date_str (str): date str in format given by twitter. 'Mon Sep 29 07:00:10 +0000 2014'
            Returns:
                (int): date key in format YYYYMMDD
        """
        date_list = date_str.split()

        ## map month abbreviations to zero-padded month numbers, e.g. 'Jan' -> '01'
        month_dict = {v: '0'+str(k) for k,v in enumerate(calendar.month_abbr) if k <10}
        month_dict.update({v:str(k) for k,v in enumerate(calendar.month_abbr) if k >=10})

        return int(date_list[3] + month_dict[date_list[2]] + date_list[1])

    def parse_rss_sites(self):
        """ Function to parse the RSS sites.
            Results are stored in self.rss_results_dict with date as key.
        """
        self.rss_results_dict = {}
        self.rss_title_list = []

        cache.clear()

        for rss_site_url in self.rss_sites:
            print "processing: ", rss_site_url
            for result in Newsfeed().search(rss_site_url)[:self.num_feeds_parse_per_site]:
                date_key = self.convert_date_str_to_date_key(result.date)
                self.rss_title_list.append(result.title)
                if date_key in self.rss_results_dict:
                    self.rss_results_dict[date_key].append([result.title, plaintext(result.text)])
                else:
                    self.rss_results_dict[date_key] = [[result.title, plaintext(result.text)]]
        print 'done'

    def parse_rss_sites_by_cat(self):
        """ Iterate over the list of categories and parse the list of rss sites.
        """
        self.rss_results_dict_by_cat ={} # dict of dict
        self.rss_title_list_by_cat = {}  # dict of list

        for cat in self.rss_sites_by_category_dict:
            print 'Processing Category: ', cat
            self.set_rss_sites(self.rss_sites_by_category_dict[cat])
            self.parse_rss_sites()
            self.rss_results_dict_by_cat[cat] = self.rss_results_dict
            self.rss_title_list_by_cat[cat] = self.rss_title_list

    def set_last_desired_date(self, num_days = 0):
        """ Return the last date in which the results will be displayed.
            It is set to be the current date - num of days as set by users.
            Affect only self.print_feeds function.
            Kwargs:
                num_days (int): num of days prior to the current date.
                Setting to 0 will only retrieve the current date
            Returns:
                (int): datekey as yyyyymmdd.
        """
        last_eff_date_list = list((datetime.date.today() - datetime.timedelta(num_days)).timetuple()[0:3])

        if len(str(last_eff_date_list[1])) == 1:
            last_eff_date_list[1] = '0' + str(last_eff_date_list[1])

        return int(str(last_eff_date_list[0]) + last_eff_date_list[1] + str(last_eff_date_list[2]))

    def print_feeds(self, rss_results_dict):
        """ Print the RSS data results.
            Args:
                rss_results_dict (dict): dict with date key mapping to a list of [title, desc] pairs.
        """
        for n in sorted(rss_results_dict.keys()): # print in date order
            print 'Results of date: ', n
            dataset = rss_results_dict[n]
            if int(n) >= self.set_last_desired_date():
                print '===='*10
                for title,desc in dataset:
                    print title
                    print desc
                    print '--'*5
                    print

    def print_feeds_for_all_cat(self):
        """ Print feeds for all the category specified by the self.rss_results_dict_by_cat

        """
        for cat in self.rss_results_dict_by_cat:
            print 'Printing Category: ', cat
            self.print_feeds(self.rss_results_dict_by_cat[cat])
            print
            print "####"*18

if __name__ == '__main__':
    f = FeedsReader()
    f.parse_rss_sites_by_cat()
    print '=='*19
    f.print_feeds_for_all_cat()
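As a quick sanity check of the date-key conversion used above, the snippet below (the input string mirrors the RSS date format) should print 20150127:

f = FeedsReader()
print f.convert_date_str_to_date_key('Tue, 27 Jan 2015 07:00:10 +0000')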

The results of the full run are as follows:

Processing Category: World
processing: http://www.ft.com/rss/home/asia
processing: http://rss.cnn.com/rss/money_news_economy.rss
processing: http://feeds.reuters.com/reuters/businessNews
done
Processing Category: SG
processing: http://feeds.theedgemarkets.com/theedgemarkets/sgtopstories.rss
processing: http://feeds.theedgemarkets.com/theedgemarkets/sgmarkets.rss
processing: http://feeds.theedgemarkets.com/theedgemarkets/sgproperty.rss
done
======================================

Printing Category: World
Results of date: 20150126
Results of date: 20150127
========================================
China seeks end to gold medal fixation
‘Blind pursuit’ of success condemned as sports administrator scraps rewards for victory
----------

Tsipras poised to unveil new Greek cabinet
Athens and international creditors dig in on Greek debt
----------

EU threatens Russia with more sanctions
Call comes as violence in eastern Ukraine escalates
----------

……..

Printing Category: SG
Results of date: 20150127
========================================
Singapore shares higher; ComfortDelGro shines on broker upgrade
SINGAPORE (Jan 27): Gains in most Asian markets helped lift Singapore shares, with much of the buying centred on penny stocks.

Gainers outnumbered decliners 267 to 187, with some 1.84 billion shares worth $1.49 billion shares changin…
----------

Job vacancies in Singapore up 8.9%
SINGAPORE (Jan 27): The number of job vacancies in Singapore swelled to 67,400 in the year to September 2014, from 61,900 the previous year, with the services industry in greatest need of workers.

The bulk of the vacancies was from c…


Rapid input data from list of files to SQLite DB

Suppose you wish to input all the data from a series of CSV files residing in a particular folder into a SQLite database. The following tools and commands make this relatively fast and painless.

Each CSV file is assumed to be in a format where the first line contains the headers and the subsequent lines contain the data. The headers need not be the same for each CSV file.
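For illustration, two hypothetical input files (names and values are made up) could look like this:

data1.csv:
    name,price
    AAA,1.23

data2.csv:
    name,volume,sector
    BBB,100000,Tech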

The following python modules are required.

  1. CSV module
  2. Pandas (0.15)
  3. SQLite3

The following lines of code execute the following actions.

  1. Use list comprehension to get the list of file path from a particular folder.
  2. Convert the data from each CSV file into dict format using CSV DictReader and iterate over all the files, joining them to a list.
  3. Convert the list of dicts to a single Pandas Dataframe.
  4. Create a connection to a particular SQLite Database.
  5. Use Pandas to_sql() function to pass all the data to SQLite database.
The code is displayed below.
import os
import csv
import pandas
import sqlite3 as lite

path = r'C:\folderpath\to\list\of\files'
## Get the full path of all the csv files.
full_path_list = [os.path.join(path, f) for f in os.listdir(path)
                  if os.path.isfile(os.path.join(path, f))]

## Convert all data to list of dicts.
full_data_list =  [n for f in full_path_list for n in csv.DictReader(open(f,'r'))]

## SQL database name and initialize the sql connection.
db_filename = r'c:\data\sbrtemp3.db'
con = lite.connect(db_filename)

## Convert to dataframe and write to sql database.
pandas.DataFrame(full_data_list).to_sql('test', con, flavor='sqlite',
                                        schema=None, if_exists='replace', index=True,
                                        index_label=None, chunksize=None, dtype=None)

## Close the SQL connection
con.close()

The Pandas to_sql() function handles existing tables via its if_exists argument and can be called multiple times if users need to add more data. In addition, building the dataframe from a list of dicts allows the headers to differ between CSV files; any missing columns are simply filled with NaN. Besides saving to a database, users can also choose to consolidate everything into a single CSV file by using the Pandas to_csv() function.
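Continuing from the script above, a short sketch of those two variations (the output CSV path is a placeholder):

## Append a later batch to the same table instead of replacing it.
pandas.DataFrame(full_data_list).to_sql('test', con, flavor='sqlite',
                                        if_exists='append', index=True)

## Or consolidate everything into a single CSV file.
pandas.DataFrame(full_data_list).to_csv(r'c:\data\consolidated.csv', index=False)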

Scraping Company info using Yahoo Query Language (YQL) and Python

Additional stock data such as company balance sheets and financial records can be scraped from the Yahoo Finance website as described in the previous post. A much faster alternative is the Yahoo Query Language (YQL). It provides collections of data from various sources, including Yahoo Finance, and enables easy querying of specific data sets. The results are generated in json format, which can be easily retrieved from the url link generated by the YQL query.

YQL provides a console that gives users an easy way to key in SQL-like syntax to query for particular information. For example, to obtain key company statistics such as P/E ratio and cash flow, the following statement can be entered into the console.

SELECT * FROM yahoo.finance.keystats WHERE symbol in ("N4E.SI","BS6.SI")

Pressing the “Test” button will generate a url that links to the json file containing all the information. An example of the url string is shown below.

https://query.yahooapis.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.finance.keystats%20WHERE%20symbol%20in%20(%22N4E.SI%22%2C%22BS6.SI%22)&format=json&diagnostics=true&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys&callback=

The url can now be used with the Pattern module to download and process the json information. For reading the json file, the simplejson module can be used. The url can be modified easily to include more stock symbols (the symbol list inside the query string). For each url, I have included about 45 stock symbols and looped over all the stock symbols required. Users can also customize and filter the data using standard SQL syntax.

This method is much faster than the direct scraping method described previously, as multiple stock symbols can be processed in one go and the json data is easily retrieved. In contrast, direct scraping can only process a single web page (or stock) at a time and requires handling of XPath to extract the data correctly.

YQL contains more than a thousand tables from different websites such as Flickr, WordPress and Twitter, and the data are organized in table form for easy retrieval. The url string also provides additional flexibility to query more data sets.

The script for this can be put together using standard url string formation, downloading of the raw data with the Pattern module, reading the data with simplejson and converting the data to a dataframe with Python Pandas.
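A minimal sketch of that pipeline, assuming the same keystats table and a placeholder symbol list (the response structure follows the code extract further below):

import urllib
import simplejson as json
import pandas
from pattern.web import URL

symbols = ['N4E.SI', 'BS6.SI']   # placeholder list of stock symbols
yql = 'SELECT * FROM yahoo.finance.keystats WHERE symbol in (%s)' % \
      ', '.join('"%s"' % s for s in symbols)
params = urllib.urlencode({'q': yql, 'format': 'json',
                           'env': 'store://datatables.org/alltableswithkeys'})
url_str = 'https://query.yahooapis.com/v1/public/yql?' + params

raw_json = URL(url_str).download()   # download with the Pattern module
data = json.loads(raw_json)          # parse with simplejson
stats = data['query']['results']['stats']
df = pandas.DataFrame(stats)         # one row per stock symbol; nested values
                                     # still need flattening, as in the extract below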

One advantage of the json format is that each result is basically a dict (e.g. one per stock symbol), and a list of such dicts can easily be transformed into a pandas DataFrame for further processing. The code extract below shows the portion in which the json file is loaded and converted to dicts that are appended to a list. This list is in turn converted to a DataFrame object by passing the list of dicts to the DataFrame constructor.

    def get_datalist_fr_json(self):
        """ Load the saved json file and append each stock's data (as a dict)
            to self.com_data_allstock_list.
            Will keep appending without any reset.
        """
        raw_data = json.load(open(self.saved_json_file, 'r'))
        for individual_set in raw_data['query']['results']['stats']:
            temp_dict_data = {}
            if isinstance(individual_set, basestring):
                # single data entry rather than a dict of parameters
                continue # temp do not use
            for parameters in individual_set.keys():
                if isinstance(individual_set[parameters], basestring):
                    temp_dict_data[parameters] = individual_set[parameters] # for symbol
                elif isinstance(individual_set[parameters], dict):
                    if 'content' in individual_set[parameters]:
                        temp_dict_data[parameters] = individual_set[parameters]['content']

            ## append to list
            self.com_data_allstock_list.append(temp_dict_data)

    def get_com_data_fr_all_stocks(self):
        """ Cater for the situation where there is a large stock list.
            As a safeguard, the list is clipped into chunks of at most 49 symbols per query.
        """
        ## encode special characters, then break the list into query-sized chunks
        full_list = self.replace_special_characters_in_list(self.full_stocklist_to_retrieve)
        chunk_of_list = self.break_list_to_sub_list(full_list)

        self.temp_full_data_df = None
        for n in chunk_of_list:
            ## print the progress
            sys.stdout.write('.')

            ## set the small chunk of list and download its data
            self.set_target_stocks_list(n)
            self.get_com_data()

        ## convert the accumulated list of dicts to a dataframe
        self.com_data_allstock_df = pandas.DataFrame(self.com_data_allstock_list)
        self.com_data_allstock_df.rename(columns={'symbol': 'SYMBOL'}, inplace=True)