1. Data preview

import contextily as cx
import geopandas as gpd
import pandas as pd
import plotly.express as px
import seaborn as sns
# Read the street tree census CSV into a DataFrame and show its first five rows.
csv_path = "../data/2015-street-tree-census-tree-data.csv"
data = pd.read_csv(csv_path)
data.head()
tree_id block_id created_at tree_dbh stump_diam curb_loc status health spc_latin spc_common ... boro_ct state latitude longitude x_sp y_sp council district census tract bin bbl
0 180683 348711 2015-08-27T00:00:00.000 3 0 OnCurb Alive Fair Acer rubrum red maple ... 4073900 New York 40.723092 -73.844215 1027431.148 202756.7687 29.0 739.0 4052307.0 4.022210e+09
1 200540 315986 2015-09-03T00:00:00.000 21 0 OnCurb Alive Fair Quercus palustris pin oak ... 4097300 New York 40.794111 -73.818679 1034455.701 228644.8374 19.0 973.0 4101931.0 4.044750e+09
2 204026 218365 2015-09-05T00:00:00.000 3 0 OnCurb Alive Good Gleditsia triacanthos var. inermis honeylocust ... 3044900 New York 40.717581 -73.936608 1001822.831 200716.8913 34.0 449.0 3338310.0 3.028870e+09
3 204337 217969 2015-09-05T00:00:00.000 10 0 OnCurb Alive Good Gleditsia triacanthos var. inermis honeylocust ... 3044900 New York 40.713537 -73.934456 1002420.358 199244.2531 34.0 449.0 3338342.0 3.029250e+09
4 189565 223043 2015-08-30T00:00:00.000 21 0 OnCurb Alive Good Tilia americana American linden ... 3016500 New York 40.666778 -73.975979 990913.775 182202.4260 39.0 165.0 3025654.0 3.010850e+09

5 rows × 45 columns

# Dimensions of the loaded table as a (rows, columns) tuple.
data.shape
(683788, 45)
# Number of records for each distinct value of the `status` column.
data["status"].value_counts()
status
Alive    652173
Stump     17654
Dead      13961
Name: count, dtype: int64
# Health breakdown restricted to the rows whose status is "Alive".
alive_mask = data["status"] == "Alive"
data.loc[alive_mask, "health"].value_counts()
health
Good    528850
Fair     96504
Poor     26818
Name: count, dtype: int64

2. NaNs in the data

# Count the missing (NaN) entries in every column, then display the counts.
nans = data.isnull().sum()
nans
tree_id                  0
block_id                 0
created_at               0
tree_dbh                 0
stump_diam               0
curb_loc                 0
status                   0
health               31616
spc_latin            31619
spc_common           31619
steward             519438
guards              603922
sidewalk             31616
user_type                0
problems            457944
root_stone               0
root_grate               0
root_other               0
trunk_wire               0
trnk_light               0
trnk_other               0
brch_light               0
brch_shoe                0
brch_other               0
address                  0
postcode                 0
zip_city                 0
community board          0
borocode                 0
borough                  0
cncldist                 0
st_assem                 0
st_senate                0
nta                      0
nta_name                 0
boro_ct                  0
state                    0
latitude                 0
longitude                0
x_sp                     0
y_sp                     0
council district      6519
census tract          6519
bin                   9559
bbl                   9559
dtype: int64
# Pie chart of the per-column missing-value counts held in `nans`.
# Columns with fewer missing values than the threshold are hidden by default
# so that the heavily affected columns stay readable. The threshold is defined
# once and reused in the title so the two cannot drift apart.
nan_threshold = 10_000
fig = px.pie(names=list(nans.index), values=list(nans.values))
fig.update_layout(
    height=600,
    hiddenlabels=list(nans[nans < nan_threshold].index),
    title=f"Number of missing values in each column (if at least {nan_threshold:,})",
)
fig.show()

3. Pairplot

# A pair plot draws one panel for every pair of numeric columns. Feeding it
# all ~684k rows is very slow and memory-hungry, so plot a random sample
# instead; the fixed seed keeps the figure reproducible between runs.
PAIRPLOT_SAMPLE_SIZE = 10_000
pairplot_sample = data.sample(
    n=min(len(data), PAIRPLOT_SAMPLE_SIZE), random_state=0
)
sns.pairplot(pairplot_sample);

4. Correlation matrix between numeric features

# Pairwise Pearson correlation between the numeric columns only
# (Pearson is the default method of DataFrame.corr).
numeric_corr = data.corr(numeric_only=True)
numeric_corr
tree_id block_id tree_dbh stump_diam postcode community board borocode cncldist st_assem st_senate boro_ct latitude longitude x_sp y_sp council district census tract bin bbl
tree_id 1.000000 0.114649 0.098413 0.012563 0.199660 0.235266 0.226142 0.149115 -0.209727 -0.231528 0.229696 -0.131784 0.137696 0.137675 -0.131558 0.146893 0.046812 0.237478 0.231893
block_id 0.114649 1.000000 0.002445 0.003633 -0.074176 0.359408 0.364807 0.063857 0.206664 0.193345 0.358776 0.142741 -0.009611 -0.010409 0.143108 0.059914 0.078268 0.342150 0.363312
tree_dbh 0.098413 0.002445 1.000000 -0.169963 0.099781 0.100037 0.094372 0.057056 -0.144429 -0.133569 0.097330 -0.034252 0.093162 0.093166 -0.034172 0.057132 0.036964 0.097007 0.095945
stump_diam 0.012563 0.003633 -0.169963 1.000000 0.040682 0.022461 0.020702 0.004629 -0.046938 -0.040944 0.021716 -0.004690 0.036701 0.036707 -0.004663 0.004276 0.007326 0.022184 0.021428
postcode 0.199660 -0.074176 0.099781 0.040682 1.000000 0.324896 0.309710 0.169539 -0.674876 -0.648751 0.323152 -0.106493 0.492576 0.492905 -0.106246 0.166506 0.114339 0.340281 0.315693
community board 0.235266 0.359408 0.100037 0.022461 0.324896 1.000000 0.999230 0.679708 -0.543515 -0.593410 0.999150 -0.627760 -0.171519 -0.171635 -0.627284 0.677678 0.176613 0.996748 0.999631
borocode 0.226142 0.364807 0.094372 0.020702 0.309710 0.999230 1.000000 0.676715 -0.529001 -0.580983 0.999268 -0.624571 -0.191089 -0.191214 -0.624108 0.674621 0.169195 0.995559 0.999541
cncldist 0.149115 0.063857 0.057056 0.004629 0.169539 0.679708 0.676715 1.000000 -0.139636 -0.194772 0.670272 -0.885264 -0.554068 -0.553739 -0.885337 0.999771 -0.018487 0.670717 0.672331
st_assem -0.209727 0.206664 -0.144429 -0.046938 -0.674876 -0.543515 -0.529001 -0.139636 1.000000 0.932196 -0.544253 0.223407 -0.489973 -0.490305 0.222944 -0.137477 -0.239845 -0.562343 -0.537014
st_senate -0.231528 0.193345 -0.133569 -0.040944 -0.648751 -0.593410 -0.580983 -0.194772 0.932196 1.000000 -0.595534 0.278002 -0.439158 -0.439504 0.277535 -0.192079 -0.224303 -0.611019 -0.587548
boro_ct 0.229696 0.358776 0.097330 0.021716 0.323152 0.999150 0.999268 0.670272 -0.544253 -0.595534 1.000000 -0.617911 -0.169691 -0.169818 -0.617434 0.668110 0.183375 0.996119 0.999326
latitude -0.131784 0.142741 -0.034252 -0.004690 -0.106493 -0.627760 -0.624571 -0.885264 0.223407 0.278002 -0.617911 1.000000 0.572289 0.571812 0.999999 -0.886017 -0.014834 -0.628894 -0.626599
longitude 0.137696 -0.009611 0.093162 0.036701 0.492576 -0.171519 -0.191089 -0.554068 -0.489973 -0.439158 -0.169691 0.572289 1.000000 0.999999 0.572757 -0.557000 0.220531 -0.152223 -0.178745
x_sp 0.137675 -0.010409 0.093166 0.036707 0.492905 -0.171635 -0.191214 -0.553739 -0.490305 -0.439504 -0.169818 0.571812 0.999999 1.000000 0.572280 -0.556669 0.220356 -0.152314 -0.178864
y_sp -0.131558 0.143108 -0.034172 -0.004663 -0.106246 -0.627284 -0.624108 -0.885337 0.222944 0.277535 -0.617434 0.999999 0.572757 0.572280 1.000000 -0.886092 -0.014404 -0.628408 -0.626121
council district 0.146893 0.059914 0.057132 0.004276 0.166506 0.677678 0.674621 0.999771 -0.137477 -0.192079 0.668110 -0.886017 -0.557000 -0.556669 -0.886092 1.000000 -0.018348 0.670766 0.672369
census tract 0.046812 0.078268 0.036964 0.007326 0.114339 0.176613 0.169195 -0.018487 -0.239845 -0.224303 0.183375 -0.014834 0.220531 0.220356 -0.014404 -0.018348 1.000000 0.183009 0.174843
bin 0.237478 0.342150 0.097007 0.022184 0.340281 0.996748 0.995559 0.670717 -0.562343 -0.611019 0.996119 -0.628894 -0.152223 -0.152314 -0.628408 0.670766 0.183009 1.000000 0.996809
bbl 0.231893 0.363312 0.095945 0.021428 0.315693 0.999631 0.999541 0.672331 -0.537014 -0.587548 0.999326 -0.626599 -0.178745 -0.178864 -0.626121 0.672369 0.174843 0.996809 1.000000

5. Tree distribution on a map of New York

# Turn each record's longitude/latitude pair into a point geometry and attach
# it to the table, declaring the coordinates as EPSG:4326 (WGS 84).
tree_points = gpd.points_from_xy(data["longitude"], data["latitude"])
gdf = gpd.GeoDataFrame(data, geometry=tree_points, crs="EPSG:4326")

# Draw one tiny marker per tree, then put background map tiles underneath,
# telling contextily which CRS the plotted data uses.
ax = gdf.plot(figsize=(8, 8), markersize=0.1)
ax.set_title("Distribution of trees in New York")
cx.add_basemap(ax, crs=gdf.crs)