Kaggle: Zillow’s Zestimate competition: C# Classes for reading source data files – Part1

Kaggle: Zillow’s Zestimate competition just launched this week, and there is $1.2M in prize money.

I wanted to quick get out some C# classes for reading the source data files. Hope this helps you get off the ground faster for this competition.

For the properties_2016.csv file, I am reading records in to this Parcel class:

public class Parcel
{
// parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,finishedsquarefeet50,finishedsquarefeet6,fips,fireplacecnt,fullbathcnt,garagecarcnt,garagetotalsqft,hashottuborspa,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode,propertylandusetypeid,propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt,storytypeid,threequarterbathnbr,typeconstructiontypeid,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
public int parcelid;
public float airconditioningtypeid;
public float architecturalstyletypeid;
public float basementsqft;
public float bathroomcnt;
public float bedroomcnt;
public float buildingclasstypeid;
public float buildingqualitytypeid;
public float calculatedbathnbr;
public float decktypeid;
public float finishedfloor1squarefeet;
public float calculatedfinishedsquarefeet;
public float finishedsquarefeet12;
public float finishedsquarefeet13;
public float finishedsquarefeet15;
public float finishedsquarefeet50;
public float finishedsquarefeet6;
public float fips;
public float fireplacecnt;
public float fullbathcnt;
public float garagecarcnt;
public float garagetotalsqft;
public float hashottuborspa;
public float heatingorsystemtypeid;
public float latitude;
public float longitude;
public float lotsizesquarefeet;
public float poolcnt;
public float poolsizesum;
public float pooltypeid10;
public float pooltypeid2;
public float pooltypeid7;
public string propertycountylandusecode;
public float propertylandusetypeid;
public string propertyzoningdesc;
public float rawcensustractandblock;
public float regionidcity;
public float regionidcounty;
public float regionidneighborhood;
public float regionidzip;
public float roomcnt;
public float storytypeid;
public float threequarterbathnbr;
public float typeconstructiontypeid;
public float unitcnt;
public float yardbuildingsqft17;
public float yardbuildingsqft26;
public float yearbuilt;
public float numberofstories;
public float fireplaceflag;
public float structuretaxvaluedollarcnt;
public float taxvaluedollarcnt;
public float assessmentyear;
public float landtaxvaluedollarcnt;
public float taxamount;
public string taxdelinquencyflag;
public float taxdelinquencyyear;
public float censustractandblock;
}

As you can see, I left most of the fields as float type. There are only a few strings that I’ve seen so far, and I left the parcel ID as an integer.

This file is about 600 MB, so I’m going to load it all up in to a dictionary with parcel ID as the key.

Dictionary<int, Parcel> parcelMap = new Dictionary<int, Parcel>();

Lastly, here’s my code to open the parcel source data file and populate the fields for each record:

public Dictionary<int, Parcel> GetParcelMap(string sourceReport)
{
var fileReader = new System.IO.StreamReader(sourceReport);
// Burn column headers
string line = fileReader.ReadLine();
string[] fields = line.Split(',');
sourceIndex = 0;
Dictionary<int, Parcel> output = new Dictionary<int, Parcel>();
while (!fileReader.EndOfStream)
{
try
{
//Processing row
string line = fileReader.ReadLine();
string[] fields = line.Split(',');
Parcel row = new Parcel();
int.TryParse(fields[0], out row.parcelid);
float.TryParse(fields[1], out row.airconditioningtypeid);
float.TryParse(fields[2], out row.architecturalstyletypeid);
float.TryParse(fields[3], out row.basementsqft);
float.TryParse(fields[4], out row.bathroomcnt);
float.TryParse(fields[5], out row.bedroomcnt);
float.TryParse(fields[6], out row.buildingclasstypeid);
float.TryParse(fields[7], out row.buildingqualitytypeid);
float.TryParse(fields[8], out row.calculatedbathnbr);
float.TryParse(fields[9], out row.decktypeid);
float.TryParse(fields[10], out row.finishedfloor1squarefeet);
float.TryParse(fields[11], out row.calculatedfinishedsquarefeet);
float.TryParse(fields[12], out row.finishedsquarefeet12);
float.TryParse(fields[13], out row.finishedsquarefeet13);
float.TryParse(fields[14], out row.finishedsquarefeet15);
float.TryParse(fields[15], out row.finishedsquarefeet50);
float.TryParse(fields[16], out row.finishedsquarefeet6);
float.TryParse(fields[17], out row.fips);
float.TryParse(fields[18], out row.fireplacecnt);
float.TryParse(fields[19], out row.fullbathcnt);
float.TryParse(fields[20], out row.garagecarcnt);
float.TryParse(fields[21], out row.garagetotalsqft);
float.TryParse(fields[22], out row.hashottuborspa);
float.TryParse(fields[23], out row.heatingorsystemtypeid);
float.TryParse(fields[24], out row.latitude);
float.TryParse(fields[25], out row.longitude);
float.TryParse(fields[26], out row.lotsizesquarefeet);
float.TryParse(fields[27], out row.poolcnt);
float.TryParse(fields[28], out row.poolsizesum);
float.TryParse(fields[29], out row.pooltypeid10);
float.TryParse(fields[30], out row.pooltypeid2);
float.TryParse(fields[31], out row.pooltypeid7);
row.propertycountylandusecode = fields[32];
float.TryParse(fields[33], out row.propertylandusetypeid);
row.propertyzoningdesc = fields[34];
float.TryParse(fields[35], out row.rawcensustractandblock);
float.TryParse(fields[36], out row.regionidcity);
float.TryParse(fields[37], out row.regionidcounty);
float.TryParse(fields[38], out row.regionidneighborhood);
float.TryParse(fields[39], out row.regionidzip);
float.TryParse(fields[40], out row.roomcnt);
float.TryParse(fields[41], out row.storytypeid);
float.TryParse(fields[42], out row.threequarterbathnbr);
float.TryParse(fields[43], out row.typeconstructiontypeid);
float.TryParse(fields[44], out row.unitcnt);
float.TryParse(fields[45], out row.yardbuildingsqft17);
float.TryParse(fields[46], out row.yardbuildingsqft26);
float.TryParse(fields[47], out row.yearbuilt);
float.TryParse(fields[48], out row.numberofstories);
float.TryParse(fields[49], out row.fireplaceflag);
float.TryParse(fields[50], out row.structuretaxvaluedollarcnt);
float.TryParse(fields[51], out row.taxvaluedollarcnt);
float.TryParse(fields[52], out row.assessmentyear);
float.TryParse(fields[53], out row.landtaxvaluedollarcnt);
float.TryParse(fields[54], out row.taxamount);
row.taxdelinquencyflag = fields[55];
float.TryParse(fields[56], out row.taxdelinquencyyear);
float.TryParse(fields[57], out row.censustractandblock);
output.Add(row.parcelid, row);
sourceIndex++;
}
catch (Exception e)
{
Console.WriteLine("ERROR: GetParcelMap failed for line: " + sourceIndex + " with exception: " + e.Message + " Stack: " + e.StackTrace);
sourceIndex++;
}
}
return output;
}

I don’t have a prediction result yet, and I’m working on that next. I’ll be sure to post my solution when it becomes available.

Fast Persistent Key Value Pairs in C# with LevelDb

Let’s say we want to crawl the internet, but we don’t want to request any given URL more than once. We need to have a collection of URL keys that we can look up. It would be nice if we could have key-value pairs, so that we can give URL keys a value in case we change our minds and want to allow URL request updates every X days. We want it to handle billions of records and be really fast (and free). This article will show how to accomplish that using LevelDb and its C# wrapper.


First, start a Visual Studio C# project and download the LevelDb.Net nuget package. There are a few different one, but this is my favorite. 


You can also find this LevelDb.Net at this Github location:

https://github.com/AntShares/leveldb

First, I’m going to show how to use LevelDb via C#. Later in this article, code shows how to insert and select a large number of records for speed testing.

Let’s create a LevelDb:

            Options levelDbOptions = new Options();
            levelDbOptions.CreateIfMissing = true;
            LevelDB.DB levelDb = LevelDB.DB.Open("myLevelDb.dat", levelDbOptions);

Next, we’ll insert some keys:

            levelDb.Put(LevelDB.WriteOptions.Default, "Key1", "Value1");
            levelDb.Put(LevelDB.WriteOptions.Default, "Key1", "Value2");
            levelDb.Put(LevelDB.WriteOptions.Default, "Key1", "Value3");
            levelDb.Put(LevelDB.WriteOptions.Default, "Key2", "Value2");

Then, we’ll select some keys:

            LevelDB.Slice outputValue;
            if (levelDb.TryGet(LevelDB.ReadOptions.Default, "Key2", out outputValue))
            {
                Console.WriteLine("Key2: Value = " + outputValue.ToString());// Expect: Value2
            }
            if (levelDb.TryGet(LevelDB.ReadOptions.Default, "Key1", out outputValue))
            {
                Console.WriteLine("Key1: Value = " + outputValue.ToString()); // Expect: Value3
            }
            if (!levelDb.TryGet(LevelDB.ReadOptions.Default, "KeyXYZ", out outputValue))
            {
                Console.WriteLine("KeyXYZ: NOT FOUND.");
            }

LevelDb supports many different types of keys and values (strings, int, float, byte[], etc…).

  1. Open instance handle.
  2. Insert = Put
  3. Select = TryGet

That’s it! 

But, how fast is it?

Let’s build a collection of MD5 hash keys and insert them:

            List<string> seedHashes = new List<string>();
            for (int idx = 0; idx < 500000; idx++)
            {
                byte[] encodedPassword = new UTF8Encoding().GetBytes(idx.ToString());
                byte[] hash = ((HashAlgorithm)CryptoConfig.CreateFromName("MD5")).ComputeHash(encodedPassword);
                string encoded = BitConverter.ToString(hash).Replace("-", string.Empty).ToLower();
                seedHashes.Add(encoded);
            }

            // Start Insert Speed Tests
            Stopwatch stopWatch = new Stopwatch();
            stopWatch.Start();
            foreach(var key in seedHashes)
            {
                levelDb.Put(LevelDB.WriteOptions.Default, key, "1");
            }
            stopWatch.Stop();
            Console.WriteLine("LevelDb Inserts took (ms) " + stopWatch.ElapsedMilliseconds);

Next, let’s select each of the keys we just inserted several times:

            // Start Lookup Speed Tests
            stopWatch.Start();
            for (int loopIndex = 0; loopIndex < 10; loopIndex++)
            {
                for(int seedIndex = 0; seedIndex < seedHashes.Count; seedIndex++)
                {
                    if (!levelDb.TryGet(LevelDB.ReadOptions.Default, seedHashes[seedIndex], out outputValue))
                    {
                        Console.WriteLine("ERROR: Key Not Found: " + seedHashes[seedIndex]);
                    }
                }
            }
            stopWatch.Stop();
            Console.WriteLine("LevelDb Lookups took (ms) " + stopWatch.ElapsedMilliseconds);
On my junky 4 year old desktop, 500,000 inserts took just under 60 seconds and 5 Million selects took just over 2 minutes. Here’s the program output:

The complete code sample is below:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using LevelDB;
using System.Security.Cryptography;
using System.Diagnostics;

namespace LevelDbExample
{
    class Program
    {
        static void Main(string[] args)
        {

            Options levelDbOptions = new Options();
            levelDbOptions.CreateIfMissing = true;
            LevelDB.DB levelDb = LevelDB.DB.Open("myLevelDb.dat", levelDbOptions);

            // Insert some records
            levelDb.Put(LevelDB.WriteOptions.Default, "Key1", "Value1");
            levelDb.Put(LevelDB.WriteOptions.Default, "Key1", "Value2");
            levelDb.Put(LevelDB.WriteOptions.Default, "Key1", "Value3");
            levelDb.Put(LevelDB.WriteOptions.Default, "Key2", "Value2");

            // Select some records
            LevelDB.Slice outputValue;
            if (levelDb.TryGet(LevelDB.ReadOptions.Default, "Key2", out outputValue))
            {
                Console.WriteLine("Key2: Value = " + outputValue.ToString());// Expect: Value2
            }
            if (levelDb.TryGet(LevelDB.ReadOptions.Default, "Key1", out outputValue))
            {
                Console.WriteLine("Key1: Value = " + outputValue.ToString()); // Expect: Value3
            }
            if (!levelDb.TryGet(LevelDB.ReadOptions.Default, "KeyXYZ", out outputValue))
            {
                Console.WriteLine("KeyXYZ: NOT FOUND.");
            }

            // Build a collection of hash keys
            List<string> seedHashes = new List<string>();
            for (int idx = 0; idx < 500000; idx++)
            {
                byte[] encodedPassword = new UTF8Encoding().GetBytes(idx.ToString());
                byte[] hash = ((HashAlgorithm)CryptoConfig.CreateFromName("MD5")).ComputeHash(encodedPassword);
                string encoded = BitConverter.ToString(hash).Replace("-", string.Empty).ToLower();
                seedHashes.Add(encoded);
            }

            // Start Insert Speed Tests
            Stopwatch stopWatch = new Stopwatch();
            stopWatch.Start();
            foreach(var key in seedHashes)
            {
                levelDb.Put(LevelDB.WriteOptions.Default, key, "1");
            }
            stopWatch.Stop();
            Console.WriteLine("LevelDb Inserts took (ms) " + stopWatch.ElapsedMilliseconds);

            // Start Lookup Speed Tests
            stopWatch.Start();
            for (int loopIndex = 0; loopIndex < 10; loopIndex++)
            {
                for(int seedIndex = 0; seedIndex < seedHashes.Count; seedIndex++)
                {
                    if (!levelDb.TryGet(LevelDB.ReadOptions.Default, seedHashes[seedIndex], out outputValue))
                    {
                        Console.WriteLine("ERROR: Key Not Found: " + seedHashes[seedIndex]);
                    }
                }
            }
            stopWatch.Stop();
            Console.WriteLine("LevelDb Lookups took (ms) " + stopWatch.ElapsedMilliseconds);

            return;
        }
    }
}