Context
The dataset provided is from the city of Melbourne and contains attributes of the properties there and of the locations in which they are situated. The aim is to perform a detailed analysis of this dataset and provide useful insights to facilitate the process of buying and selling real estate.
Data Description:
The detailed data dictionary is given below:
- Suburb - Suburb in which the property is located
- Rooms - Number of rooms in the property
- Type - Type of the property:
- h - house, cottage, villa, semi, terrace
- t - townhouse
- u - unit, duplex
- SellerG - Name of the real estate agent who sold the property
- Date - Date on which the property was sold
- Distance - Distance of the property from the CBD in kilometres. The CBD is the central business district of the city.
- Postcode - Postcode of the area
- Bedroom - Number of bedrooms in the property
- Bathroom - Number of bathrooms in the property
- Car - Number of car parking spaces in the property
- Landsize - Size of the land in square metres
- BuildingArea - Size of the building in square metres (generally smaller than the land size, since a property can include gardens and other open areas)
- YearBuilt - Year in which the building was built
- Regionname - Name of the region in which the property is located, such as Eastern Metropolitan, Western Metropolitan, Northern Victoria, etc.
- Propertycount - Number of properties that are present in the suburb
- Price - price (in AUD) at which the property was sold
4.1 Sanity Checks
Importing necessary libraries
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# to restrict the float value to 3 decimal places
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# let colab access my google drive
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Importing the dataset
# Load data from csv file
data = pd.read_csv('/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_One_-_Python_for_Data_Science/Optional Material/Melbourne_Housing.csv')
View the first and last 5 rows of the dataset
data.head()
Suburb | Rooms | Type | SellerG | Date | Distance | Postcode | Bedroom | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Regionname | Propertycount | Price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Airport West | 3 | t | Nelson | 03-09-2016 | 13.500 | 3042.000 | 3.000 | 2.000 | 1.000 | 303.000 | 225 | 2016.000 | Western Metropolitan | 3464 | 840000 |
1 | Albert Park | 2 | h | hockingstuart | 03-09-2016 | 3.300 | 3206.000 | 2.000 | 1.000 | 0.000 | 120.000 | 82 | 1900.000 | Southern Metropolitan | 3280 | 1275000 |
2 | Albert Park | 2 | h | Thomson | 03-09-2016 | 3.300 | 3206.000 | 2.000 | 1.000 | 0.000 | 159.000 | inf | NaN | Southern Metropolitan | 3280 | 1455000 |
3 | Alphington | 4 | h | Brace | 03-09-2016 | 6.400 | 3078.000 | 3.000 | 2.000 | 4.000 | 853.000 | 263 | 1930.000 | Northern Metropolitan | 2211 | 2000000 |
4 | Alphington | 3 | h | Jellis | 03-09-2016 | 6.400 | 3078.000 | 3.000 | 2.000 | 2.000 | 208.000 | inf | 2013.000 | Northern Metropolitan | 2211 | 1110000 |
data.tail()
Suburb | Rooms | Type | SellerG | Date | Distance | Postcode | Bedroom | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Regionname | Propertycount | Price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
27109 | Noble Park | 3 | h | C21 | 30-09-2017 | 22.700 | 3174.000 | 3.000 | 1.000 | 6.000 | 569.000 | 130 | 1959.000 | South-Eastern Metropolitan | 11806 | 627500 |
27110 | Reservoir | 3 | u | RW | 30-09-2017 | 12.000 | 3073.000 | 3.000 | 1.000 | 1.000 | NaN | 105 | 1990.000 | Northern Metropolitan | 21650 | 475000 |
27111 | Roxburgh Park | 4 | h | Raine | 30-09-2017 | 20.600 | 3064.000 | 4.000 | 2.000 | 2.000 | NaN | 225 | 1995.000 | Northern Metropolitan | 5833 | 591000 |
27112 | Springvale South | 3 | h | Harcourts | 30-09-2017 | 22.200 | 3172.000 | 3.000 | 2.000 | 1.000 | 544.000 | NaN | NaN | South-Eastern Metropolitan | 4054 | 780500 |
27113 | Westmeadows | 4 | h | Barry | 30-09-2017 | 16.500 | 3049.000 | 4.000 | 2.000 | 6.000 | 813.000 | 140 | 1960.000 | Northern Metropolitan | 2474 | 791000 |
Understand the shape of the dataset
# checking shape of the data
print("There are", data.shape[0], 'rows and', data.shape[1], "columns.")
There are 27114 rows and 16 columns.
Check the data types of the columns for the dataset
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27114 entries, 0 to 27113
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Suburb         27114 non-null  object
 1   Rooms          27114 non-null  int64
 2   Type           27114 non-null  object
 3   SellerG        27114 non-null  object
 4   Date           27114 non-null  object
 5   Distance       27113 non-null  float64
 6   Postcode       27113 non-null  float64
 7   Bedroom        20678 non-null  float64
 8   Bathroom       20672 non-null  float64
 9   Car            20297 non-null  float64
 10  Landsize       17873 non-null  float64
 11  BuildingArea   10543 non-null  object
 12  YearBuilt      11985 non-null  float64
 13  Regionname     27114 non-null  object
 14  Propertycount  27114 non-null  int64
 15  Price          27114 non-null  int64
dtypes: float64(7), int64(3), object(6)
memory usage: 3.3+ MB
- There are 10 numerical columns in the data and 6 object type columns.
- The Date column is being read as an object type column, but it should be in datetime format.
- The BuildingArea column is being read as an object type column, but it should be numerical.
# changing the data type of Date column
data['Date'] = pd.to_datetime(data['Date'], format = "%d-%m-%Y")
# let's see why BuildingArea column has object data type
data['BuildingArea'].unique()
array(['225', '82', 'inf', '263', '242', '251', '117', 'missing', '76', '399', '118', '103', '180', '123', '218', '129', '167', '154', '275', '121', nan, '125', '255', '75', '156', '240', '268', '108', '69', '140', '214', '253', '189', '215', '96', '104', '100', '313', '144', '93', '110', '70', '122', '51', '147', '113', '83', '56', '137', '85', '64', '175', '3558', '170', '265', '353', '138', '19', '116', '87', '74', '320', '300', '210', '120', '86', '97', '200', '106', '14', '161', '128', '185', '146', '133', '115', '143', '150', '195', '236', '276', '188', '179', '249', '141', '34', '73', '107', '84', '81', '207', '50', '264', '312', '235', '221', '183', '132', '160', '186', '78', '105', '145', '62', '220', '315', '181', '61', '112', '420', '226', '266', '410', '449', '356', '477', '250', '95', '190', '284', '247', '213', '209', '119', '111', '130', '348', '166', '44', '176', '98', '159', '79', '71', '60', '33', '89', '217', '127', '187', '109', '53', '201', '12', '63', '223', '102', '254', '327', '16', '165', '65', '139', '134', '280', '67', '272', '38', '153', '66', '152', '135', '374', '163', '124', '233', '39', '151', '279', '136', '405', '199', '158', '126', '80', '57', '36', '142', '77', '435', '92', '149', '114', '349', '178', '594', '274', '40', '68', '164', '204', '94', '257', '323', '314', '378', '55', '91', '309', '88', '58', '172', '211', '148', '49', '52', '174', '191', '335', '808', '168', '203', '520', '212', '222', '171', '228', '101', '267', '90', '196', '157', '99', '256', '15', '155', '162', '72', '252', '330', '177', '341', '197', '340', '182', '245', '270', '42', '229', '232', '131', '297', '237', '194', '173', '46', '360', '205', '45', '365', '321', '37', '304', '54', '305', '244', '43', '41', '22', '169', '59', '258', '230', '287', '618', '792', '355', '202', '395', '351', '325', '248', '224', '426', '286', '308', '198', '260', '291', '277', '281', '216', '262', '364', '30', '192', '47', '184', '25', '558', '362', '375', '347', '241', '475', '413', '400', '101.37', '219', '239', '123.21', '140.7481', '86.5', '30.6', '225.5', '322', '409.54', '26', '156.6', '227.6', '104.63', '193', '443', '167.13', '148.5', '270.18', '53.3', '98.5', '157.9351', '328', '94.3', '21', '310', '95.88', '261', '105.7', '72.9', '45.4', '221.3', '113.2', '186.75', '540', '127.8', '231', '106.4', '311', '234', '332', '206', '259', '246', '296', '269', '306', '380', '302', '48', '278', '35', '208', '467', '283', '458', '196.8', '106.2', '32', '424', '430', '243', '290', '511', '500', '531', '508', '333', '72.3', '66.23', '101.51', '292', '112.9', '99.5', '77.5', '293', '105.23', '352', '391', '121.8', '82.6', '136.49', '502', '680', '157.9352', '126.8', '100.6', '44.4', '151.54', '20', '397', '495', '331', '294', '334', '303', '318', '273', '584', '958', '288', '342', '354', '350', '465', '337', '317', '298', '529', '697', '289', '602', '6791', '534', '227', '429', '802', '373', '358', '238', '316', '393', '180.9', '826.8367', '366', '307', '625', '419', '94.5', '392', '28', '116.65', '11', '134.6', '282', '183.84', '431', '126.7', '18', '295', '390', '447', '271', '416', '370', '660', '463', '361', '496', '324', '173.45', '134.3', '464.3', '76.77', '113.76', '107.4', '255.79', '186.36', '490', '103.6', '63.7', '106.76', '39.5', '423', '319', '437', '285', '453', '472', '418', '487', '31', '301', '116.4', '182.85', '109.5', '129.7', '665', '398', '148.47', '553', '182.9', '184.75', '193.74', '68.5', '336', '346', '719', '3112', '414', '372', '525', '367', '1561', '454', '720', '329', '381', 
'677', '439', '857', '23', '503', '432', '377', '512', '425', '406', '389', '521', '6178', '654', '727', '417', '560', '168.01', '306.19', '515.78', '196.1', '789', '131.27', '167.87', '183.97', '199.5', '29', '446', '75.45', '165.5', '263.5', '585', '516', '345', '27', '444', '427', '326', '518', '13', '411', '1022', '339', '461', '376', '513', '836', '464', '386', '603', '613', '179.3', '142.6', '147.2', '225.98', '113.81', '401', '737', '530', '82.3', '110.87', '450', '673', '109.98', '122.25', '105.9', '101.7', '106.9', '93.82', '132.6', '154.3', '178.74', '88.3', '80.7', '89.25', '181.6', '104.4', '180.56', '123.5', '1143', '506', '575', '859', '653', '999', '445', '438', '686', '649', '736', '371.6122', '199.73', '118.54', '298.21', '169.5', '42.2', '135.5', '266.76', '272.4', '177.8', '139.4', '700', '766', '538', '528', '650', '368', '544', '557', '396', '638', '101.76', '363', '634', '195.0964', '470', '130.52', '547', '111.04', '63.4', '92.5', '115.96', '121.84', '122.86', '114.2', '266.53', '78.54', '85.35', '1041', '607', '478', '407', '344', '338', '501', '448', '357', '408', '412', '466', '739', '614', '421', '385', '68.11', '153.1', '85.2', '93.84', '124.45', '210.68', '200.71', '81.79', '934', '129.92', '43.7', '113.6', '66.32', '35.64', '61.6', '388.5', '672'], dtype=object)
- It will be difficult to inspect every data point to find the non-numeric values in this column. Let's count how many values of each data type there are and see what the non-numeric values look like.
# checking the count of different data types in BuildingArea column
data['BuildingArea'].apply(type).value_counts()
count | |
---|---|
BuildingArea | |
<class 'float'> | 16571 |
<class 'str'> | 10543 |
- We can see that the column contains mixed data types, with string values like 'missing' and 'inf' that should actually be read as missing values (NaN). Let's replace such data points with null values.
# replacing values with nan
data['BuildingArea'] = data['BuildingArea'].replace(['missing','inf'],np.nan)
# changing the data type to float
data['BuildingArea'] = data['BuildingArea'].astype(float)
# let's check the data type of columns again
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27114 entries, 0 to 27113
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Suburb         27114 non-null  object
 1   Rooms          27114 non-null  int64
 2   Type           27114 non-null  object
 3   SellerG        27114 non-null  object
 4   Date           27114 non-null  datetime64[ns]
 5   Distance       27113 non-null  float64
 6   Postcode       27113 non-null  float64
 7   Bedroom        20678 non-null  float64
 8   Bathroom       20672 non-null  float64
 9   Car            20297 non-null  float64
 10  Landsize       17873 non-null  float64
 11  BuildingArea   10529 non-null  float64
 12  YearBuilt      11985 non-null  float64
 13  Regionname     27114 non-null  object
 14  Propertycount  27114 non-null  int64
 15  Price          27114 non-null  int64
dtypes: datetime64[ns](1), float64(8), int64(3), object(4)
memory usage: 3.3+ MB
- We see that the data types of Date and BuildingArea columns have been fixed.
- There are 11 numerical columns, 4 object type columns, and 1 date time column in the data.
- We observe that some columns have fewer entries than others (less than 27114 rows), which indicates the presence of missing values in the data.
We could also have replaced 'missing' and 'inf' with NaN while loading the data. Let's see how:
# using na_values to tell python which values it should consider as NaN
data_new = pd.read_csv('/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_One_-_Python_for_Data_Science/Optional Material/Melbourne_Housing.csv',na_values=['missing','inf'])
- This method is useful when we know what kind of values to consider as anomalies in the data.
data_new['BuildingArea'].dtype
dtype('float64')
- We observe that BuildingArea is now read as float, since 'missing' and 'inf' were parsed as NaN.
What are missing values?
Missing values occur when no data value is stored for the variable in an observation. Missing data are a common occurrence and can have a significant effect on the conclusions that can be drawn from the data.
Checking for missing values in the data
data.isnull().sum()
0 | |
---|---|
Suburb | 0 |
Rooms | 0 |
Type | 0 |
SellerG | 0 |
Date | 0 |
Distance | 1 |
Postcode | 1 |
Bedroom | 6436 |
Bathroom | 6442 |
Car | 6817 |
Landsize | 9241 |
BuildingArea | 16585 |
YearBuilt | 15129 |
Regionname | 0 |
Propertycount | 0 |
Price | 0 |
- There are missing values in 8 columns of the data.
- We will treat these missing values after understanding the distributions of features and the relationships that exist in the data. This will help us impute these values more effectively.
Checking for duplicate entries in the data
data.duplicated().sum()
11
- There are 11 duplicate entries in the data. Let's remove them.
# dropping duplicate entries from the data
data.drop_duplicates(inplace=True)
# resetting the index of data frame since some rows will be removed
data.reset_index(drop=True,inplace=True)
Let's check the statistical summary of the data.
data.describe().T
count | mean | min | 25% | 50% | 75% | max | std | |
---|---|---|---|---|---|---|---|---|
Rooms | 27103.000 | 2.992 | 1.000 | 2.000 | 3.000 | 4.000 | 16.000 | 0.955 |
Date | 27103 | 2017-05-23 12:25:09.441759488 | 2016-01-28 00:00:00 | 2016-11-19 00:00:00 | 2017-07-08 00:00:00 | 2017-10-28 00:00:00 | 2018-03-17 00:00:00 | NaN |
Distance | 27102.000 | 11.280 | 0.000 | 6.400 | 10.500 | 14.000 | 48.100 | 6.784 |
Postcode | 27102.000 | 3113.787 | 3000.000 | 3046.000 | 3088.000 | 3153.000 | 3978.000 | 111.129 |
Bedroom | 20678.000 | 3.046 | 0.000 | 2.000 | 3.000 | 4.000 | 20.000 | 0.955 |
Bathroom | 20672.000 | 1.592 | 0.000 | 1.000 | 1.000 | 2.000 | 9.000 | 0.701 |
Car | 20297.000 | 1.716 | 0.000 | 1.000 | 2.000 | 2.000 | 18.000 | 0.994 |
Landsize | 17873.000 | 560.537 | 50.000 | 220.000 | 513.000 | 664.000 | 76000.000 | 1411.309 |
BuildingArea | 10529.000 | 154.512 | 11.000 | 101.000 | 133.000 | 183.000 | 6791.000 | 130.584 |
YearBuilt | 11985.000 | 1966.618 | 1850.000 | 1950.000 | 1970.000 | 2000.000 | 2019.000 | 36.042 |
Propertycount | 27103.000 | 7564.741 | 83.000 | 4294.000 | 6567.000 | 10412.000 | 21650.000 | 4494.028 |
Price | 27103.000 | 1050664.131 | 85000.000 | 635000.000 | 871000.000 | 1300000.000 | 11200000.000 | 641660.161 |
- Rooms: On average there are 3 rooms in a property. 75% of the properties have 4 or fewer rooms, while some properties have as many as 16 rooms.
- Distance: On average the properties are located within 11.28 km of the central business district, while the median is 10.5 km. There is a huge difference between the maximum value and the 75th percentile, which indicates that outliers might be present in this column.
- The distributions of the number of bedrooms, bathrooms, and car parking spaces look fine. The maximum values of these columns might require a quick check.
- Landsize: The Landsize column has a huge standard deviation, with a mean and median land size of 560.54 and 513 square metres respectively. There is also a huge difference between the maximum value and the 75th percentile, which indicates that outliers might be present in this column.
- YearBuilt: The latest property was built in 2019, while the earliest was built in 1850.
- Propertycount: On average a suburb has around 7,565 properties. The column has a large standard deviation of about 4,494.
- Price: The average selling price is about AUD 1,050,664, while the median is AUD 871,000. There is a huge difference between the maximum value and the 75th percentile, which indicates that outliers might be present in this column.
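To back these outlier remarks with numbers, a quick check (a sketch; it uses the usual Q3 + 1.5 * IQR upper whisker, the same convention applied later in this notebook) is to compare each skewed column's maximum with its upper whisker:
# comparing the maximum of each skewed column with its upper whisker (Q3 + 1.5 * IQR)
for col in ['Distance', 'Landsize', 'Price']:
    q1, q3 = data[col].quantile([0.25, 0.75])
    upper_whisker = q3 + 1.5 * (q3 - q1)
    print(f"{col}: max = {data[col].max():,.1f}, upper whisker = {upper_whisker:,.1f}")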
# let's check the total number of unique values in the Postcode column
data['Postcode'].nunique()
209
- There are a total of 209 different postcodes in the data.
Let's check the count and percentage of categorical levels in each column
# Making a list of all categorical variables
cat_cols = ['Suburb', 'Type', 'SellerG', 'Regionname']
# Printing the count of unique categorical levels in each column
for column in cat_cols:
    print(data[column].value_counts())
    print("-" * 50)
Suburb
Reservoir         724
Bentleigh East    493
Richmond          437
Preston           410
Brunswick         383
                 ...
Wandin North        1
Ferny Creek         1
Tecoma              1
Montrose            1
viewbank            1
Name: count, Length: 345, dtype: int64
--------------------------------------------------
Type
h    18394
u     5882
t     2827
Name: count, dtype: int64
--------------------------------------------------
SellerG
Nelson           2733
Jellis           2516
Barry            2385
hockingstuart    2096
Ray              1574
                 ...
Allan               1
Jim                 1
iProperty           1
Batty               1
Icon                1
Name: count, Length: 347, dtype: int64
--------------------------------------------------
Regionname
Southern Metropolitan         8480
Northern Metropolitan         7809
Western Metropolitan          5792
Eastern Metropolitan          3264
South-Eastern Metropolitan    1336
Northern Victoria              165
Eastern Victoria               163
Western Victoria                94
Name: count, dtype: int64
--------------------------------------------------
# Printing the percentage of unique categorical levels in each column
for column in cat_cols:
    print(data[column].value_counts(normalize=True))
    print("-" * 50)
Suburb
Reservoir         0.027
Bentleigh East    0.018
Richmond          0.016
Preston           0.015
Brunswick         0.014
                  ...
Wandin North      0.000
Ferny Creek       0.000
Tecoma            0.000
Montrose          0.000
viewbank          0.000
Name: proportion, Length: 345, dtype: float64
--------------------------------------------------
Type
h   0.679
u   0.217
t   0.104
Name: proportion, dtype: float64
--------------------------------------------------
SellerG
Nelson          0.101
Jellis          0.093
Barry           0.088
hockingstuart   0.077
Ray             0.058
                ...
Allan           0.000
Jim             0.000
iProperty       0.000
Batty           0.000
Icon            0.000
Name: proportion, Length: 347, dtype: float64
--------------------------------------------------
Regionname
Southern Metropolitan        0.313
Northern Metropolitan        0.288
Western Metropolitan         0.214
Eastern Metropolitan         0.120
South-Eastern Metropolitan   0.049
Northern Victoria            0.006
Eastern Victoria             0.006
Western Victoria             0.003
Name: proportion, dtype: float64
--------------------------------------------------
Observations
- The highest number of properties (724) is in the Reservoir suburb, followed by 493 properties in Bentleigh East.
- Most of the properties (18,394, or about 68%) are of type 'h', i.e., houses/cottages/villas, etc., followed by 5,882 (about 22%) of type 'u', i.e., units/duplexes. The smallest group (2,827, or about 10%) is of type 't', i.e., townhouses.
- The top 5 sellers of properties are Nelson, Jellis, Barry, hockingstuart, and Ray.
- 31% of the properties are in Southern Metropolitan, followed by 28% in Northern Metropolitan and 21% in Western Metropolitan. This shows that our analysis will be biased towards the Metropolitan regions rather than the Victoria regions, as around 98% of the data points come from Metropolitan regions.
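As a quick, hedged check of the ~98% figure (a small sketch that simply looks for 'Metropolitan' in the region name):
# share of properties in Metropolitan vs Victoria regions
is_metro = data['Regionname'].str.contains('Metropolitan')
print(is_metro.value_counts(normalize=True))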
4.2 Univariate Analysis
Let's check the distribution for numerical columns.
Observations on Distance
sns.histplot(data=data,x='Distance',stat='density')
plt.show()
sns.boxplot(data=data,x='Distance')
plt.show()
- The distribution is skewed to the right.
- There are many outliers present in this column.
- Values above 25 km are represented as outliers in the boxplot, indicating that there are many properties that are at least 25 km away from the CBD.
Observations on Landsize
sns.displot(data=data,x='Landsize',kind='kde')
plt.show()
sns.boxplot(data=data,x='Landsize')
plt.show()
# converting Landsize to sq. kilometres from sq. metres
sns.displot(data=data,x=data['Landsize']/1000000,kind='kde')
plt.show()
sns.boxplot(data=data,x=data['Landsize']/1000000)
plt.show()
- The distribution is skewed to the right, even after conversion to square kilometres.
- There are many outliers present in this column. Some properties have a land size of more than 60,000 square metres. These values seem very high and could be data entry errors. We should check them further.
data.loc[data['Landsize']>60000]
Suburb | Rooms | Type | SellerG | Date | Distance | Postcode | Bedroom | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Regionname | Propertycount | Price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
23897 | Silvan | 3 | h | Harcourts | 2017-05-27 | 34.600 | 3795.000 | 3.000 | 2.000 | 2.000 | 76000.000 | NaN | NaN | Eastern Victoria | 457 | 1085000 |
25079 | Balwyn North | 3 | h | Kay | 2016-08-28 | 9.200 | 3104.000 | 3.000 | 1.000 | 2.000 | 75100.000 | NaN | NaN | Southern Metropolitan | 7809 | 2000000 |
- Just by looking at these observations, it is difficult to say whether they are data entry errors or not, although the land sizes do look very high.
Observations on BuildingArea
sns.displot(data=data,x='BuildingArea',kind='kde')
plt.show()
sns.boxplot(data=data,x='BuildingArea')
plt.show()
- The distribution of BuildingArea is similar to that of Landsize, i.e., right-skewed.
- It has many outliers. There are values above 4,000 square metres, which seem high.
Observations on Price
sns.histplot(data=data,x='Price')
plt.show()
sns.boxplot(data=data,x='Price')
plt.show()
- The distribution of Price is skewed to the right.
- There are many outliers in this variable, and values above 2,000,000 are represented as outliers by the boxplot.
- These values seem plausible, as the selling price of a property varies and depends on various factors. For example, properties closer to the CBD might have a higher selling price.
Observations on Rooms
sns.boxplot(data=data,x='Rooms')
plt.show()
- Properties with more than 7 rooms are being represented as outliers by the boxplot.
- Let's find out how many properties have more than 7 rooms and what the types of these properties are.
data.loc[data['Rooms']>7].shape
(23, 16)
- There are only 23 such properties which have more than 7 rooms.
# finding the type of such properties
data.loc[data['Rooms']>7,'Type'].value_counts()
Type
h    21
u     2
Name: count, dtype: int64
- There are no townhouses among these.
- Most of these properties are houses/villas, etc.
- Only 2 of these properties are units/duplexes.
- This indicates that 'h' type properties are a more likely choice for customers who are looking for bigger properties.
Observations on Region Name
sns.countplot(data=data,x='Regionname')
plt.xticks(rotation=90)
plt.show()
- Eastern, Northern, and Western Victoria have very few properties.
- Southern and Northern Metropolitan have the most properties.
4.3 Bivariate Analysis
plt.figure(figsize=(10,5))
sns.heatmap(data.corr(numeric_only = True),annot=True,cmap='Spectral',vmin=-1,vmax=1)
plt.show()
Observations
- The Price column shows a high correlation with the number of rooms, bedrooms, bathrooms, and car parking spaces. This indicates that the more rooms, bedrooms, etc. a property has, the higher its selling price.
- Price has a negative correlation with the Distance column. This indicates that as the distance increases, the selling price of a property decreases.
- Price has a negative correlation with the YearBuilt column. This indicates that vintage properties have higher selling prices compared to properties built recently.
- Rooms, Bedroom, and Bathroom show a high correlation with each other, which is expected.
- BuildingArea and Landsize show a positive correlation, which makes sense: a larger plot of land generally allows a larger building.
- We should not consider the correlation values involving Postcode, because this column is a unique identifier for an area, and an increase or decrease in postcode cannot impact any other feature.
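To see the numbers behind these observations, the Price column of the correlation matrix can be pulled out and sorted (a small sketch):
# correlation of each numeric column with Price, sorted from strongest positive to strongest negative
print(data.corr(numeric_only=True)['Price'].sort_values(ascending=False))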
Properties with more living space, i.e., more rooms, bedrooms, and bathrooms, generally tend to have higher selling prices. Let's analyze the relationship between the price and the total number of rooms in a property.
# let's create a column with a sum of number of rooms, bedrooms, bathrooms, and car parking spaces
data['Total Space'] = data['Rooms'] + data['Bedroom'] + data['Bathroom'] + data['Car']
data.head()
Suburb | Rooms | Type | SellerG | Date | Distance | Postcode | Bedroom | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Regionname | Propertycount | Price | Total Space | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Airport West | 3 | t | Nelson | 2016-09-03 | 13.500 | 3042.000 | 3.000 | 2.000 | 1.000 | 303.000 | 225.000 | 2016.000 | Western Metropolitan | 3464 | 840000 | 9.000 |
1 | Albert Park | 2 | h | hockingstuart | 2016-09-03 | 3.300 | 3206.000 | 2.000 | 1.000 | 0.000 | 120.000 | 82.000 | 1900.000 | Southern Metropolitan | 3280 | 1275000 | 5.000 |
2 | Albert Park | 2 | h | Thomson | 2016-09-03 | 3.300 | 3206.000 | 2.000 | 1.000 | 0.000 | 159.000 | NaN | NaN | Southern Metropolitan | 3280 | 1455000 | 5.000 |
3 | Alphington | 4 | h | Brace | 2016-09-03 | 6.400 | 3078.000 | 3.000 | 2.000 | 4.000 | 853.000 | 263.000 | 1930.000 | Northern Metropolitan | 2211 | 2000000 | 13.000 |
4 | Alphington | 3 | h | Jellis | 2016-09-03 | 6.400 | 3078.000 | 3.000 | 2.000 | 2.000 | 208.000 | NaN | 2013.000 | Northern Metropolitan | 2211 | 1110000 | 10.000 |
- The column has been successfully added in the data frame.
plt.figure(figsize=(10,5))
sns.scatterplot(data=data,x='Total Space',y='Price')
plt.show()
- We can see an increasing trend of selling price with total space. Let's visualize the trend using lmplot().
sns.lmplot(data=data,x='Total Space',y='Price',height=5,aspect=2)
plt.xlim(0,55)
plt.show()
- A positive correlation, or an increasing trend, can be clearly observed between the total number of rooms and the selling price of the property, although the line plotted by lmplot() suggests the correlation is not very strong. Let's check the correlation value further.
- The positive correlation indicates that more living space implies a higher selling price.
# lets check the correlation between Total space and Price
data[['Total Space','Price']].corr()
Total Space | Price | |
---|---|---|
Total Space | 1.000 | 0.458 |
Price | 0.458 | 1.000 |
- As expected, there is a positive correlation between Total Space and Price but it is not very high.
The distance of a property from key facilities plays a crucial role in deciding its selling price. Properties located closer to key facilities tend to be priced higher, and vice versa. Let's find out if the same relationship exists in our data.
plt.figure(figsize=(15,7))
sns.scatterplot(data=data, x='Distance', y ='Price')
plt.show()
plt.figure(figsize=(15,7))
sns.lineplot(data=data, x='Distance', y='Price', errorbar=None)
plt.show()
- We observe that after 20-25 km the selling price of the properties starts decreasing, which indicates that distance plays a key role in deciding the selling price of a property.
- Let's create bins/buckets for the Distance column to get a better visualization of the relationship between Price and Distance.
Creating bins for the Distance column
- 0 - 15 km - The property will be said to be in a Nearby location.
- 15 - 30 km - The property will be said to be in a Moderately Close location.
- 30 - 50 km - The property will be said to be in a Far away location.
We will use the pd.cut() function to create the bins for the Distance column.
Syntax: pd.cut(x, bins, labels=None, right=False)
x - the column/array to be binned
bins - the number of bins to create, or a list of bin edges
labels - specifies the labels for the bins
right - if set to False, the right edge of each interval is excluded
# using pd.cut() function to create bins
data['Distance_bins'] = pd.cut(data['Distance'],bins=[0,15,30,50],labels=['Nearby','Moderately Close','Far'], right = False)
data.head()
Suburb | Rooms | Type | SellerG | Date | Distance | Postcode | Bedroom | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Regionname | Propertycount | Price | Total Space | Distance_bins | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Airport West | 3 | t | Nelson | 2016-09-03 | 13.500 | 3042.000 | 3.000 | 2.000 | 1.000 | 303.000 | 225.000 | 2016.000 | Western Metropolitan | 3464 | 840000 | 9.000 | Nearby |
1 | Albert Park | 2 | h | hockingstuart | 2016-09-03 | 3.300 | 3206.000 | 2.000 | 1.000 | 0.000 | 120.000 | 82.000 | 1900.000 | Southern Metropolitan | 3280 | 1275000 | 5.000 | Nearby |
2 | Albert Park | 2 | h | Thomson | 2016-09-03 | 3.300 | 3206.000 | 2.000 | 1.000 | 0.000 | 159.000 | NaN | NaN | Southern Metropolitan | 3280 | 1455000 | 5.000 | Nearby |
3 | Alphington | 4 | h | Brace | 2016-09-03 | 6.400 | 3078.000 | 3.000 | 2.000 | 4.000 | 853.000 | 263.000 | 1930.000 | Northern Metropolitan | 2211 | 2000000 | 13.000 | Nearby |
4 | Alphington | 3 | h | Jellis | 2016-09-03 | 6.400 | 3078.000 | 3.000 | 2.000 | 2.000 | 208.000 | NaN | 2013.000 | Northern Metropolitan | 2211 | 1110000 | 10.000 | Nearby |
Let's check the price with the distance bins
sns.boxplot(data=data,x='Distance_bins',y='Price')
plt.show()
- It is a little difficult to make observations here with so many outliers present in the data. Let's turn off these outliers (without removing them from the data) and see if we can observe any noticeable difference.
sns.boxplot(data=data,x='Distance_bins',y='Price',showfliers=False) # showfliers parameter controls the representation of outliers in the boxplot
plt.show()
- We can see that as the distance increases, the selling price of the property decreases.
- The properties closer to the CBD have a high variance in selling price. This variance decreases as the distance increases.
- Consider a customer who has planned their buying budget around the median selling price of nearby, moderately close, or far away properties.
- The high variation in selling price near the CBD might push such a customer over their planned budget, whereas for properties far from the CBD they are more likely to stay within it.
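To put numbers on this budget point, a quick sketch is to look at the quartiles of Price within each distance bin (observed=False keeps all three bins even though Distance_bins is a categorical column):
# 25th, 50th (median), and 75th percentile of Price within each distance bin
print(data.groupby('Distance_bins', observed=False)['Price'].describe()[['25%', '50%', '75%']])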
As we observed in the correlation plot there is a negative correlation between the selling price of a property and the year it was built. Let's analyze it further
# let's first calculate the age of a property from the year it was built in to see how the prices vary with it
year_at_sale = data['Date'].dt.year
year_at_sale
Date | |
---|---|
0 | 2016 |
1 | 2016 |
2 | 2016 |
3 | 2016 |
4 | 2016 |
... | ... |
27098 | 2017 |
27099 | 2017 |
27100 | 2017 |
27101 | 2017 |
27102 | 2017 |
27103 rows × 1 columns
np.max(year_at_sale)
2018
data['AgeofProp'] = year_at_sale - data['YearBuilt']
data.head()
Suburb | Rooms | Type | SellerG | Date | Distance | Postcode | Bedroom | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Regionname | Propertycount | Price | Total Space | Distance_bins | AgeofProp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Airport West | 3 | t | Nelson | 2016-09-03 | 13.500 | 3042.000 | 3.000 | 2.000 | 1.000 | 303.000 | 225.000 | 2016.000 | Western Metropolitan | 3464 | 840000 | 9.000 | Nearby | 0.000 |
1 | Albert Park | 2 | h | hockingstuart | 2016-09-03 | 3.300 | 3206.000 | 2.000 | 1.000 | 0.000 | 120.000 | 82.000 | 1900.000 | Southern Metropolitan | 3280 | 1275000 | 5.000 | Nearby | 116.000 |
2 | Albert Park | 2 | h | Thomson | 2016-09-03 | 3.300 | 3206.000 | 2.000 | 1.000 | 0.000 | 159.000 | NaN | NaN | Southern Metropolitan | 3280 | 1455000 | 5.000 | Nearby | NaN |
3 | Alphington | 4 | h | Brace | 2016-09-03 | 6.400 | 3078.000 | 3.000 | 2.000 | 4.000 | 853.000 | 263.000 | 1930.000 | Northern Metropolitan | 2211 | 2000000 | 13.000 | Nearby | 86.000 |
4 | Alphington | 3 | h | Jellis | 2016-09-03 | 6.400 | 3078.000 | 3.000 | 2.000 | 2.000 | 208.000 | NaN | 2013.000 | Northern Metropolitan | 2211 | 1110000 | 10.000 | Nearby | 3.000 |
- The column was added in the data frame.
# checking the property where the computed age is negative, i.e., it was sold before its recorded YearBuilt
data[data['AgeofProp']==-2]
Suburb | Rooms | Type | SellerG | Date | Distance | Postcode | Bedroom | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Regionname | Propertycount | Price | Total Space | Distance_bins | AgeofProp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
21326 | Brighton East | 2 | h | Buxton | 2016-09-24 | 10.700 | 3187.000 | 4.000 | 3.000 | 2.000 | 302.000 | 250.000 | 2018.000 | Southern Metropolitan | 6938 | 1310000 | 11.000 | Nearby | -2.000 |
plt.figure(figsize=(15,5))
sns.lineplot(data=data,x='AgeofProp',y='Price',errorbar=None)
plt.show()
- We observe an increasing trend, indicating that older (vintage) properties have higher selling prices.
- The customers who wish to live in vintage properties might have to spend more money.
- Let's see this trend for all the regions
- sns.relplot() is used to visualize statistical relationships between quantitative variables.
- Why use relplot() instead of scatterplot()?
- relplot() is a figure-level function that lets you create a grid of subplots (facets), split by a categorical variable.
- kind - specifies the kind of plot to draw (scatter or line)
- errorbar - controls the error bars / confidence interval (the older ci parameter is deprecated)
- col_wrap - specifies the number of columns in the grid
sns.relplot(data=data,x='AgeofProp',y='Price',col='Regionname',kind='line', errorbar=None, col_wrap=4)
plt.show()
# double click on the plot to zoom in
- The trend of selling price increasing with the increase in age of the property is evident from the plot for Metropolitan regions.
- The trend of Victoria regions is slightly unclear, but this was expected as the number of data points for these regions is very low.
The price of a property varies based on its type. For example, a villa may be priced higher than a duplex because of more amenities. Let's see which type of property is priced higher.
plt.figure(figsize=(10,5))
sns.boxplot(data=data,x='Type',y='Price',showfliers=False) # turning off outliers
plt.show()
- Type 'h' properties (houses, cottages, villas, etc.) have a higher median price than townhouses and units/duplexes.
- Townhouses have a higher median price than unit/duplex properties.
- Customers planning to buy an 'h' type property might have to invest more.
Similarly, the region of the property will play an integral role in deciding the selling price. Let's do a similar analysis with regions as well.
plt.figure(figsize=(10,5))
sns.boxplot(data=data,x='Regionname',y='Price',showfliers=False) # turning off outliers
plt.xticks(rotation=90)
plt.show()
# Dispersion of price in every region
sns.catplot(x='Price',
col='Regionname',
data=data,
col_wrap=4,
kind="violin")
plt.show()
- In the Metropolitan regions, the southern and eastern regions have the costliest properties, while the northern and western regions are priced similarly.
- In the Victoria regions, there is an increasing trend in prices: the western region has the lowest-priced properties, the northern region has moderately priced properties, and the eastern region has the highest-priced properties.
4.4 Missing value treatment
- There is no universally accepted method of dealing with missing values.
- It is often left to the discretion of the data scientist to take a decision on missing values, i.e., whether to impute them or drop them.
- However, it is sometimes better practice to impute the missing values rather than drop them, because dropping them can lead to loss of information in other features where values are present.
How to treat missing values?
One of the commonly used methods for dealing with missing values is to impute them with a measure of central tendency - the mean, median, or mode of the column.
- Replacing with the mean: the missing values are imputed with the mean of the column. The mean is affected by outliers, so in columns that have outliers this method may lead to erroneous imputations.
- Replacing with the median: the missing values are imputed with the median of the column. When the column has outliers, the median is a more appropriate measure of central tendency than the mean.
- Replacing with the mode: the missing values are imputed with the mode of the column. This method is generally preferred for categorical data.
- Other methods include k-NN imputation, MICE (multiple imputation), and model-based/deep-learning approaches.
Limitations of imputing missing values with central tendencies
- When we impute the missing values with central tendencies the original distribution of the feature can get distorted.
- After imputation with the central value the variance and standard deviation of a feature can get drastically impacted.
- The distortion is greater when the percentage of missing values is higher.
So, before directly imputing the missing values with the central values of a column, we should investigate the missing data closely to observe the pattern of missingness, and then decide on an appropriate measure for imputation.
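For illustration only, here is a minimal sketch of the plain column-level imputation described above, done on a throwaway copy (df_demo is a name introduced just for this sketch; the imputation actually performed later in this section is group-wise instead):
# sketch: simple central-tendency imputation on a copy of the data
df_demo = data.copy()
df_demo['Landsize'] = df_demo['Landsize'].fillna(df_demo['Landsize'].median())  # median - robust to outliers
df_demo['Bathroom'] = df_demo['Bathroom'].fillna(df_demo['Bathroom'].mean())    # mean - shown for illustration; median is safer for skewed columns
df_demo['Regionname'] = df_demo['Regionname'].fillna(df_demo['Regionname'].mode()[0])  # mode - for categorical columns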
Let's see the count and the percentage of missing values in each column.
# data.shape[0] will give us the number of rows in the dataset
# selecting the instances where missing value is greater than 0
pd.DataFrame({'Count':data.isnull().sum()[data.isnull().sum()>0],'Percentage':(data.isnull().sum()[data.isnull().sum()>0]/data.shape[0])*100})
Count | Percentage | |
---|---|---|
Distance | 1 | 0.004 |
Postcode | 1 | 0.004 |
Bedroom | 6425 | 23.706 |
Bathroom | 6431 | 23.728 |
Car | 6806 | 25.112 |
Landsize | 9230 | 34.055 |
BuildingArea | 16574 | 61.152 |
YearBuilt | 15118 | 55.780 |
Total Space | 6806 | 25.112 |
Distance_bins | 1 | 0.004 |
AgeofProp | 15118 | 55.780 |
- BuildingArea and YearBuilt columns have the highest percentage of missing values.
- Distance and Postcode have the least number of missing values.
- Bedroom, Bathroom, and Car have almost equal percentage of missing values.
- AgeofProp has the same percentage of missing values as YearBuilt, and Total Space matches Car, because these columns were derived from YearBuilt and from Rooms/Bedroom/Bathroom/Car respectively (a quick check follows this list).
- Landsize has around 34% missing values.
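A quick sketch verifying that the derived columns inherit their missingness from their source columns:
# AgeofProp should be missing exactly where YearBuilt is missing
print((data['AgeofProp'].isnull() == data['YearBuilt'].isnull()).all())
# Total Space should be missing wherever any of its component columns is missing
components_missing = data[['Rooms', 'Bedroom', 'Bathroom', 'Car']].isnull().any(axis=1)
print((data['Total Space'].isnull() == components_missing).all())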
Missing value treatment for Distance column
# extracting all the information of other variables where Distance is null
data.loc[data['Distance'].isnull()==True]
Suburb | Rooms | Type | SellerG | Date | Distance | Postcode | Bedroom | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Regionname | Propertycount | Price | Total Space | Distance_bins | AgeofProp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9590 | Fawkner Lot | 3 | h | Brad | 2018-01-06 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Northern Metropolitan | 962 | 616000 | NaN | NaN | NaN |
- We observe that the data is missing for other attributes as well. This is a common problem with data collection: sometimes the data is not missing at random and there is a pattern to the missingness.
- Let's see if we can find more information using the name of the Suburb.
data.loc[data['Suburb']=='Fawkner Lot']
Suburb | Rooms | Type | SellerG | Date | Distance | Postcode | Bedroom | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Regionname | Propertycount | Price | Total Space | Distance_bins | AgeofProp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9590 | Fawkner Lot | 3 | h | Brad | 2018-01-06 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Northern Metropolitan | 962 | 616000 | NaN | NaN | NaN |
- We see that there is only one data entry for the Fawkner Lot suburb, and in this case it is possible that data for this suburb was simply not collected. This shows that the data is not missing at random but rather has a pattern to its missingness.
- Imputing the Distance as well as the other missing information would not be a suitable approach, as the imputed values would not be reliable. Hence, we will drop this row.
# adding the index value of the row in data.drop() function
data = data.drop(9590).reset_index(drop=True)
pd.DataFrame({'Count':data.isnull().sum()[data.isnull().sum()>0],'Percentage':(data.isnull().sum()[data.isnull().sum()>0]/data.shape[0])*100})
Count | Percentage | |
---|---|---|
Bedroom | 6424 | 23.703 |
Bathroom | 6430 | 23.725 |
Car | 6805 | 25.109 |
Landsize | 9229 | 34.053 |
BuildingArea | 16573 | 61.150 |
YearBuilt | 15117 | 55.778 |
Total Space | 6805 | 25.109 |
AgeofProp | 15117 | 55.778 |
# extracting all the information of other variable where Bedroom is null
data.loc[data['Bedroom'].isnull()==True]
Suburb | Rooms | Type | SellerG | Date | Distance | Postcode | Bedroom | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Regionname | Propertycount | Price | Total Space | Distance_bins | AgeofProp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
8 | Altona North | 4 | h | hockingstuart | 2016-09-03 | 11.100 | 3025.000 | NaN | NaN | NaN | NaN | NaN | NaN | Western Metropolitan | 5132 | 857500 | NaN | Nearby | NaN |
12 | Ashburton | 2 | h | Marshall | 2016-09-03 | 11.000 | 3147.000 | NaN | NaN | NaN | NaN | NaN | NaN | Southern Metropolitan | 3052 | 1820000 | NaN | Nearby | NaN |
14 | Avondale Heights | 4 | h | Jellis | 2016-09-03 | 10.500 | 3034.000 | NaN | NaN | NaN | NaN | NaN | NaN | Western Metropolitan | 4502 | 1310000 | NaN | Nearby | NaN |
25 | Balwyn North | 4 | u | hockingstuart | 2016-09-03 | 9.200 | 3104.000 | NaN | NaN | NaN | NaN | NaN | NaN | Southern Metropolitan | 7809 | 1450000 | NaN | Nearby | NaN |
26 | Balwyn North | 2 | h | Fletchers | 2016-09-03 | 9.200 | 3104.000 | NaN | NaN | NaN | NaN | NaN | NaN | Southern Metropolitan | 7809 | 1305000 | NaN | Nearby | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
27088 | Footscray | 2 | u | McGrath | 2017-09-30 | 5.100 | 3011.000 | NaN | NaN | NaN | NaN | NaN | NaN | Western Metropolitan | 7570 | 455500 | NaN | Nearby | NaN |
27089 | Forest Hill | 2 | h | Fletchers | 2017-09-30 | 15.400 | 3131.000 | NaN | NaN | NaN | NaN | NaN | NaN | Eastern Metropolitan | 4385 | 762000 | NaN | Moderately Close | NaN |
27090 | Glen Waverley | 3 | u | Ray | 2017-09-30 | 16.700 | 3150.000 | NaN | NaN | NaN | NaN | NaN | NaN | Eastern Metropolitan | 15321 | 1100000 | NaN | Moderately Close | NaN |
27093 | Kingsbury | 2 | t | RW | 2017-09-30 | 12.100 | 3083.000 | NaN | NaN | NaN | NaN | NaN | NaN | Northern Metropolitan | 1414 | 512000 | NaN | Nearby | NaN |
27095 | Mitcham | 4 | h | Noel | 2017-09-30 | 17.200 | 3132.000 | NaN | NaN | NaN | NaN | NaN | NaN | Eastern Metropolitan | 6871 | 800000 | NaN | Moderately Close | NaN |
6424 rows × 19 columns
- It looks like wherever Bedroom is null the data points in other columns are also missing. Let's check this.
data.loc[data['Bedroom'].isnull()==True,'Bathroom'].value_counts(dropna=False)
count | |
---|---|
Bathroom | |
NaN | 6424 |
data.loc[data['Bedroom'].isnull()==True,'Car'].value_counts(dropna=False)
count | |
---|---|
Car | |
NaN | 6424 |
data.loc[data['Bedroom'].isnull()==True,'Landsize'].value_counts(dropna=False)
count | |
---|---|
Landsize | |
NaN | 6418 |
594.000 | 1 |
446.000 | 1 |
338.000 | 1 |
250.000 | 1 |
549.000 | 1 |
239.000 | 1 |
data.loc[data['Bedroom'].isnull()==True,'BuildingArea'].value_counts(dropna=False)
count | |
---|---|
BuildingArea | |
NaN | 6424 |
data.loc[data['Bedroom'].isnull()==True,'YearBuilt'].value_counts(dropna=False)
count | |
---|---|
YearBuilt | |
NaN | 6424 |
- There seems to be a strong pattern in missing values, as wherever the Bedroom column has missing data the other columns like Bathroom, Car, Landsize, BuildingArea, and YearBuilt also have missing values.
- Let's see if the missing data has some pattern in suburbs, and regions of properties.
data.loc[data['Bedroom'].isnull()==True,'Suburb'].value_counts(dropna=False)
count | |
---|---|
Suburb | |
Reservoir | 236 |
Bentleigh East | 144 |
St Kilda | 116 |
Glenroy | 116 |
Richmond | 111 |
... | ... |
Cranbourne East | 1 |
Melton West | 1 |
New Gisborne | 1 |
Scoresby | 1 |
Plenty | 1 |
307 rows × 1 columns
# to find the total number of unique suburbs
data['Suburb'].nunique()
344
- Across 344 suburbs, missing values are present in 307, indicating that most suburbs have missing values.
- There seems to be no pattern in the missing values with respect to the Suburb column.
data.loc[data['Bedroom'].isnull()==True,'Regionname'].value_counts(dropna=False)
count | |
---|---|
Regionname | |
Southern Metropolitan | 2197 |
Northern Metropolitan | 1912 |
Western Metropolitan | 1212 |
Eastern Metropolitan | 728 |
South-Eastern Metropolitan | 317 |
Eastern Victoria | 26 |
Northern Victoria | 25 |
Western Victoria | 7 |
- Similar to suburbs there seems to be no notable pattern across different regions.
Missing value treatment for Bedroom, Bathroom, and Car columns
- One approach to treating the missing values in these columns is to group the data by the Region and Type of the property, to get a better idea of the typical number of bedrooms, bathrooms, and car parking spaces.
- A property of a certain type in a given region is likely to have a similar number of bedrooms, bathrooms, and car parking spaces.
# checking the average number of bedrooms, bathrooms, and car parking spaces in a region
data.groupby(['Regionname','Type'])[['Bedroom','Bathroom','Car']].mean()
Bedroom | Bathroom | Car | ||
---|---|---|---|---|
Regionname | Type | |||
Eastern Metropolitan | h | 3.552 | 1.807 | 1.946 |
t | 3.032 | 1.853 | 1.673 | |
u | 2.357 | 1.263 | 1.290 | |
Eastern Victoria | h | 3.560 | 1.881 | 2.098 |
u | 2.667 | 1.000 | 1.333 | |
Northern Metropolitan | h | 3.106 | 1.468 | 1.685 |
t | 2.567 | 1.620 | 1.362 | |
u | 1.878 | 1.160 | 1.093 | |
Northern Victoria | h | 3.496 | 1.892 | 2.146 |
u | 3.000 | 2.000 | 2.000 | |
South-Eastern Metropolitan | h | 3.476 | 1.713 | 2.094 |
t | 2.887 | 1.849 | 1.679 | |
u | 2.260 | 1.205 | 1.342 | |
Southern Metropolitan | h | 3.383 | 1.840 | 1.883 |
t | 3.024 | 2.012 | 1.780 | |
u | 1.939 | 1.191 | 1.135 | |
Western Metropolitan | h | 3.244 | 1.576 | 1.907 |
t | 2.880 | 1.851 | 1.538 | |
u | 2.106 | 1.192 | 1.144 | |
Western Victoria | h | 3.379 | 1.448 | 2.060 |
- We now have the mean number of bedrooms, bathrooms, and car parking spaces for each type of property in each region.
We will use the fillna() function and the transform() method of pandas to impute the missing values.
fillna() function - The fillna() function is used to fill NaN values with the provided input value.
Syntax of fillna(): data['column'].fillna(value = x)
transform() function - The transform() function works on each value of a DataFrame and allows a specified function to be executed on each value.
Syntax of transform(): data.transform(func = function name)
* func - A function to be executed on the values of the DataFrame.
# imputing missing values in Bedroom column
data['Bedroom'] = data['Bedroom'].fillna(value = data.groupby(['Regionname','Type'])['Bedroom'].transform('mean'))
# imputing missing values in Bathroom column
data['Bathroom'] = data['Bathroom'].fillna(value = data.groupby(['Regionname','Type'])['Bathroom'].transform('mean'))
# imputing missing values in Car column
data['Car'] = data['Car'].fillna(value = data.groupby(['Regionname','Type'])['Car'].transform('mean'))
# checking if all the missing values were imputed in Bedroom, Bathroom, and Car columns
pd.DataFrame({'Count':data.isnull().sum()[data.isnull().sum()>0],'Percentage':(data.isnull().sum()[data.isnull().sum()>0]/data.shape[0])*100})
Count | Percentage | |
---|---|---|
Landsize | 9229 | 34.053 |
BuildingArea | 16573 | 61.150 |
YearBuilt | 15117 | 55.778 |
Total Space | 6805 | 25.109 |
AgeofProp | 15117 | 55.778 |
- We see that the missing values have been imputed.
- Let's convert all the values of Bedroom, Bathroom, and Car to integer type as these columns have discrete values.
data['Bedroom'] = data['Bedroom'].astype(int)
data['Bathroom'] = data['Bathroom'].astype(int)
data['Car'] = data['Car'].astype(int)
Missing values of Total Space column
We can create this feature again from the combination of Rooms, Bedroom, Bathroom, and Car as the missing values have now been imputed.
# removing Total Space column
data.drop('Total Space',axis=1,inplace=True)
# creating new Total Space column
data['Total_Space_New'] = data['Rooms'] + data['Bedroom'] + data['Bathroom'] + data['Car']
data['Total_Space_New'] = data['Total_Space_New'].astype(int)
Note:
It is a good idea to check the distribution of a column again after missing value imputation.
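A minimal sketch of such a check, assuming a copy of the affected columns was saved just before the fillna() calls above (the before_imputation variable is hypothetical and not part of this notebook):
# compare summary statistics before and after imputation
# before_imputation = data[['Bedroom', 'Bathroom', 'Car']].copy()  # would need to be taken before imputing
# print(before_imputation.describe())
print(data[['Bedroom', 'Bathroom', 'Car']].describe())
# and a visual check of one imputed column's distribution
sns.histplot(data=data, x='Bedroom', discrete=True)
plt.show()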
Let's check the relationship of Total Space column with Price once again
sns.scatterplot(data=data,x='Total_Space_New',y='Price')
plt.show()
- We observe that the relationship between Total Space and Price has not changed: the positive relationship between these variables is still maintained, which is a good sign.
Missing value Treatment for Landsize
- We will use a similar approach to the one we took for imputing the missing values in the Bedroom, Bathroom, and Car columns. Grouping by the region and type of property is a reasonable way to estimate the land size.
- Let's plot the distribution of Landsize.
sns.displot(data=data,x='Landsize',kind='kde')
plt.show()
- As the Landsize column is skewed, using the mean for imputation might not be appropriate, since the mean is affected by outliers. So we will use the median to impute the missing values of this column, as the median is robust to outliers.
data.groupby(['Regionname','Type'])[['Landsize']].median()
Landsize | ||
---|---|---|
Regionname | Type | |
Eastern Metropolitan | h | 693.500 |
t | 236.000 | |
u | 208.000 | |
Eastern Victoria | h | 840.000 |
u | 230.000 | |
Northern Metropolitan | h | 482.000 |
t | 145.000 | |
u | 50.000 | |
Northern Victoria | h | 737.500 |
u | 315.000 | |
South-Eastern Metropolitan | h | 632.000 |
t | 248.000 | |
u | 210.000 | |
Southern Metropolitan | h | 593.000 |
t | 267.000 | |
u | 50.000 | |
Western Metropolitan | h | 537.000 |
t | 200.000 | |
u | 80.000 | |
Western Victoria | h | 603.000 |
- We have received the median value of Landsize for each type of property in a specific region.
# grouping data on region and type of property
# finding the median of landsize for each group and imputing the missing data with it
data['Landsize'] = data['Landsize'].fillna(value = data.groupby(['Regionname','Type'])['Landsize'].transform('median'))
# checking if all the missing values were imputed in Landsize column
pd.DataFrame({'Count':data.isnull().sum()[data.isnull().sum()>0],'Percentage':(data.isnull().sum()[data.isnull().sum()>0]/data.shape[0])*100})
Count | Percentage | |
---|---|---|
BuildingArea | 16573 | 61.150 |
YearBuilt | 15117 | 55.778 |
AgeofProp | 15117 | 55.778 |
Missing value treatment for BuildingArea, YearBuilt, and AgeofProp
- We will not do any imputation for BuildingArea and YearBuilt, as these columns have more than 50% missing data. As AgeofProp was derived from the YearBuilt column, we cannot impute it either.
- Any imputation in these columns would create new data points that are not reliable enough to extract insights from.
- We have two options here -
- Option 1 - We can use only the rows where the data is not null to proceed with the analysis, which preserves the data that is not missing.
- Option 2 - We can drop these columns from the data frame, but this leads to a loss of the information contained in these features.
# Option 1 - keeping only the rows where BuildingArea is not null
new_data = data[data['BuildingArea'].notnull()]
new_data.head()
Suburb | Rooms | Type | SellerG | Date | Distance | Postcode | Bedroom | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Regionname | Propertycount | Price | Distance_bins | AgeofProp | Total_Space_New | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Airport West | 3 | t | Nelson | 2016-09-03 | 13.500 | 3042.000 | 3 | 2 | 1 | 303.000 | 225.000 | 2016.000 | Western Metropolitan | 3464 | 840000 | Nearby | 0.000 | 9 |
1 | Albert Park | 2 | h | hockingstuart | 2016-09-03 | 3.300 | 3206.000 | 2 | 1 | 0 | 120.000 | 82.000 | 1900.000 | Southern Metropolitan | 3280 | 1275000 | Nearby | 116.000 | 5 |
3 | Alphington | 4 | h | Brace | 2016-09-03 | 6.400 | 3078.000 | 3 | 2 | 4 | 853.000 | 263.000 | 1930.000 | Northern Metropolitan | 2211 | 2000000 | Nearby | 86.000 | 13 |
5 | Altona | 3 | h | Greg | 2016-09-03 | 13.800 | 3018.000 | 3 | 2 | 1 | 352.000 | 242.000 | 2015.000 | Western Metropolitan | 5301 | 520000 | Nearby | 1.000 | 9 |
6 | Altona North | 5 | h | FN | 2016-09-03 | 11.100 | 3025.000 | 5 | 3 | 6 | 592.000 | 251.000 | 1965.000 | Western Metropolitan | 5132 | 1085000 | Nearby | 51.000 | 19 |
- We observe that the index of data frame has changed and the rows which had missing values in BuildingArea column have been removed.
- Similarly the above steps can be repeated for YearBuilt and AgeofProp column.
# Option 2 - dropping columns BuildingArea, YearBuilt, and AgeofProp from the data frame
data = data.drop(['BuildingArea','YearBuilt','AgeofProp'],axis=1)
# saving the dataset with all the missing values treated
data.to_csv('/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_One_-_Python_for_Data_Science/Optional Material/Melbourne_Housing_NoMissing.csv',index=False)
4.5 Outlier Detection and Treatment
from IPython.display import Image
Image("/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_One_-_Python_for_Data_Science/Optional Material/Box Plot_OC.png")
An outlier is a data point that is abnormally/unrealistically distant from the other points in the data.
The challenge with outlier detection is determining whether a point is truly a problem or simply a large value. If a point is genuine, it is very important to keep it in the data, as otherwise we would be removing the most interesting data points.
It is left to the best judgement of the investigator to decide whether treating outliers is necessary and how to go about it. Domain Knowledge and impact of the business problem tend to drive this decision.
Handling outliers
Some of the commonly used methods to deal with the data points that we flag as outliers are:
- Replacement with null values - We can consider these data points as missing data and replace the abnormal values with NaN.
- IQR method - Replace the data points with the lower whisker (Q1 - 1.5 * IQR) or upper whisker (Q3 + 1.5 * IQR) value.
- We can also drop these observations, but we might end up losing other relevant observations as well.
So, it is often a good idea to examine the results by running an analysis with and without outliers.
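As a sketch of the first option above (flagging outliers as missing rather than capping them; this is not the approach taken in the rest of this section, and the function name is introduced here only for illustration):
# option 1 (sketch): turn values outside the IQR whiskers into NaN so that
# they can later be handled like any other missing value
def flag_outliers_as_nan(df, col):
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    df[col] = df[col].where(df[col].between(lower, upper))  # out-of-range values become NaN
    return df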
# reading the original dataset again (the outlier treatment below is demonstrated on the raw data)
data = pd.read_csv('/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_One_-_Python_for_Data_Science/Optional Material/Melbourne_Housing.csv')
data.head()
Suburb | Rooms | Type | SellerG | Date | Distance | Postcode | Bedroom | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Regionname | Propertycount | Price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Airport West | 3 | t | Nelson | 03-09-2016 | 13.500 | 3042.000 | 3.000 | 2.000 | 1.000 | 303.000 | 225 | 2016.000 | Western Metropolitan | 3464 | 840000 |
1 | Albert Park | 2 | h | hockingstuart | 03-09-2016 | 3.300 | 3206.000 | 2.000 | 1.000 | 0.000 | 120.000 | 82 | 1900.000 | Southern Metropolitan | 3280 | 1275000 |
2 | Albert Park | 2 | h | Thomson | 03-09-2016 | 3.300 | 3206.000 | 2.000 | 1.000 | 0.000 | 159.000 | inf | NaN | Southern Metropolitan | 3280 | 1455000 |
3 | Alphington | 4 | h | Brace | 03-09-2016 | 6.400 | 3078.000 | 3.000 | 2.000 | 4.000 | 853.000 | 263 | 1930.000 | Northern Metropolitan | 2211 | 2000000 |
4 | Alphington | 3 | h | Jellis | 03-09-2016 | 6.400 | 3078.000 | 3.000 | 2.000 | 2.000 | 208.000 | inf | 2013.000 | Northern Metropolitan | 2211 | 1110000 |
Let's visualize all the outliers present in data together
# outlier detection using boxplot
# selecting the numerical columns of data and adding their names in a list
numeric_columns = ['Rooms', 'Distance', 'Postcode', 'Bedroom', 'Bathroom', 'Car','Landsize',
'Propertycount', 'Price']
plt.figure(figsize=(15, 12))
for i, variable in enumerate(numeric_columns):
    plt.subplot(4, 4, i + 1)
    plt.boxplot(data[variable], whis=1.5)
    plt.tight_layout()
    plt.title(variable)
plt.show()
We can see that all the numerical features in the data set have outliers present in them.
Let's analyze each column to see if the values in them can be considered as outliers or not.
- Rooms, Bedroom, Bathroom, Car - The values represented as outliers in the boxplots above are uncommon: it is rare to see more than 15 rooms, 15 bedrooms, or more than 10 car parking spaces. Although such values are not impossible, they are rare, so we will treat the outliers in these columns.
- Distance - The outliers in this column can be considered genuine values, because a property can be at varying distances from the CBD. In the context of this problem we will not treat these values as outliers.
- Postcode - Postcodes are numerical identifiers based on region, sub-region, etc., and cannot be considered outliers.
- Landsize - Some values in this column seem unrealistic, as a few properties have a land size greater than 60,000 square metres. So, we will treat the outliers in this column.
- Propertycount - Each suburb can have a varying number of properties depending on the region and the area of the suburb. Hence, the counts of properties in a suburb can be considered genuine values and not outliers.
- Price - The selling price of a property depends on various factors (for example its location) and can change over time. Hence, the selling prices can be considered genuine values and not outliers.
Let's find the percentage of outliers, in each column of the data, using IQR.
# to find the 25th percentile and 75th percentile for the numerical columns.
Q1 = data[numeric_columns].quantile(0.25)
Q3 = data[numeric_columns].quantile(0.75)
IQR = Q3 - Q1 #Inter Quantile Range (75th percentile - 25th percentile)
lower_whisker = Q1 - 1.5*IQR #Finding lower and upper bounds for all values. All values outside these bounds are outliers
upper_whisker = Q3 + 1.5*IQR
# Percentage of outliers in each column
((data[numeric_columns] < lower_whisker) | (data[numeric_columns] > upper_whisker)).sum()/data.shape[0]*100
0 | |
---|---|
Rooms | 0.085 |
Distance | 4.341 |
Postcode | 2.910 |
Bedroom | 0.070 |
Bathroom | 0.870 |
Car | 4.304 |
Landsize | 1.652 |
Propertycount | 2.670 |
Price | 4.658 |
Treating outliers
We will cap/clip the minimum and maximum value of these columns to the lower and upper whisker value of the boxplot found using Q1 - 1.5*IQR and Q3 + 1.5*IQR, respectively.
Note: Generally, a multiplier of 1.5 * IQR is used to cap outliers at the upper and lower whiskers, but any other multiplier (e.g. 0.5, 2, 3) can be chosen. The choice depends on the business problem.
Creating a function to floor and cap/clip outliers in a column
def treat_outliers(df, col):
    """
    Treats outliers in a numerical column by capping them at the whiskers.
    df: dataframe
    col: str, name of the numerical column
    """
    Q1 = df[col].quantile(0.25)  # 25th percentile
    Q3 = df[col].quantile(0.75)  # 75th percentile
    IQR = Q3 - Q1  # Interquartile Range (75th percentile - 25th percentile)
    lower_whisker = Q1 - 1.5 * IQR
    upper_whisker = Q3 + 1.5 * IQR
    # all the values smaller than lower_whisker will be assigned the value of lower_whisker
    # all the values greater than upper_whisker will be assigned the value of upper_whisker
    # the assignment is done using NumPy's clip function
    df[col] = np.clip(df[col], lower_whisker, upper_whisker)
    return df
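As the note above says, the 1.5 multiplier is only a convention; a small variant (a sketch, not used further below) exposes it as a parameter:
# variant of treat_outliers with the whisker multiplier exposed as a parameter
def treat_outliers_k(df, col, k=1.5):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    df[col] = np.clip(df[col], Q1 - k * IQR, Q3 + k * IQR)
    return df

# e.g. treat_outliers_k(data, 'Landsize', k=3) would cap only the most extreme values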
Treating outliers in Rooms column
data = treat_outliers(data,'Rooms')
# visualizing the column after outlier treatment
sns.boxplot(data=data,x='Rooms')
plt.show()
- The maximum number of rooms in a property has been capped at 7. All the values greater than 7 (the upper whisker value) have been treated.
Similarly we will treat the outliers in other columns using the same approach
# treating outliers of Bedroom column
data = treat_outliers(data,'Bedroom')
# treating outliers of Car column
data = treat_outliers(data,'Car')
# treating outliers of Bathroom column
data = treat_outliers(data,'Bathroom')
# treating outliers of Landsize column
data = treat_outliers(data,'Landsize')
Let's visualize numerical columns where outliers were treated
# outlier detection using boxplot
# selecting the numerical columns where outliers were treated
numeric_columns = ['Rooms', 'Bedroom', 'Bathroom', 'Car','Landsize']
plt.figure(figsize=(15, 12))
for i, variable in enumerate(numeric_columns):
    plt.subplot(4, 4, i + 1)
    plt.boxplot(data[variable], whis=1.5)
    plt.tight_layout()
    plt.title(variable)
plt.show()
- All the outliers have been treated.
- We observe that the Bedroom, Bathroom, and Car columns are of float type, which we should change to the integer type.
- We can also see that the distribution of the Landsize column no longer looks heavily right-skewed.
Setting the data type to integer for Bedroom, Bathroom, and Car columns
# Imputing missing values in Bedroom column using median instead of mean
data['Bedroom'] = data['Bedroom'].fillna(value = data.groupby(['Regionname','Type'])['Bedroom'].transform('median'))
# Imputing missing values in Bathroom column using median instead of mean
data['Bathroom'] = data['Bathroom'].fillna(value = data.groupby(['Regionname','Type'])['Bathroom'].transform('median'))
# Imputing missing values in Car column using median instead of mean
data['Car'] = data['Car'].fillna(value = data.groupby(['Regionname','Type'])['Car'].transform('median'))
# Convert the columns to integers after ensuring no NaN values
data['Bedroom'] = data['Bedroom'].astype(int)
data['Bathroom'] = data['Bathroom'].astype(int)
data['Car'] = data['Car'].astype(int)
# saving the dataset with all the outlier values treated
data.to_csv('/content/drive/MyDrive/Python Course/Melbourne_Housing_NoOutliers.csv',index=False)