import contextily as cx
import geopandas as gpd
import pandas as pd
import plotly.express as px
import seaborn as sns
1. Data preview
data = pd.read_csv("../data/2015-street-tree-census-tree-data.csv")
data.head(5)
| | tree_id | block_id | created_at | tree_dbh | stump_diam | curb_loc | status | health | spc_latin | spc_common | ... | boro_ct | state | latitude | longitude | x_sp | y_sp | council district | census tract | bin | bbl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 180683 | 348711 | 2015-08-27T00:00:00.000 | 3 | 0 | OnCurb | Alive | Fair | Acer rubrum | red maple | ... | 4073900 | New York | 40.723092 | -73.844215 | 1027431.148 | 202756.7687 | 29.0 | 739.0 | 4052307.0 | 4.022210e+09 |
| 1 | 200540 | 315986 | 2015-09-03T00:00:00.000 | 21 | 0 | OnCurb | Alive | Fair | Quercus palustris | pin oak | ... | 4097300 | New York | 40.794111 | -73.818679 | 1034455.701 | 228644.8374 | 19.0 | 973.0 | 4101931.0 | 4.044750e+09 |
| 2 | 204026 | 218365 | 2015-09-05T00:00:00.000 | 3 | 0 | OnCurb | Alive | Good | Gleditsia triacanthos var. inermis | honeylocust | ... | 3044900 | New York | 40.717581 | -73.936608 | 1001822.831 | 200716.8913 | 34.0 | 449.0 | 3338310.0 | 3.028870e+09 |
| 3 | 204337 | 217969 | 2015-09-05T00:00:00.000 | 10 | 0 | OnCurb | Alive | Good | Gleditsia triacanthos var. inermis | honeylocust | ... | 3044900 | New York | 40.713537 | -73.934456 | 1002420.358 | 199244.2531 | 34.0 | 449.0 | 3338342.0 | 3.029250e+09 |
| 4 | 189565 | 223043 | 2015-08-30T00:00:00.000 | 21 | 0 | OnCurb | Alive | Good | Tilia americana | American linden | ... | 3016500 | New York | 40.666778 | -73.975979 | 990913.775 | 182202.4260 | 39.0 | 165.0 | 3025654.0 | 3.010850e+09 |
5 rows × 45 columns
data.shape
(683788, 45)
data.status.value_counts()
status
Alive 652173
Stump 17654
Dead 13961
Name: count, dtype: int64
data[data.status == 'Alive'].health.value_counts()
health
Good 528850
Fair 96504
Poor 26818
Name: count, dtype: int64
2. NaNs in data
nans = data.isna().sum(axis=0)
nans
tree_id 0
block_id 0
created_at 0
tree_dbh 0
stump_diam 0
curb_loc 0
status 0
health 31616
spc_latin 31619
spc_common 31619
steward 519438
guards 603922
sidewalk 31616
user_type 0
problems 457944
root_stone 0
root_grate 0
root_other 0
trunk_wire 0
trnk_light 0
trnk_other 0
brch_light 0
brch_shoe 0
brch_other 0
address 0
postcode 0
zip_city 0
community board 0
borocode 0
borough 0
cncldist 0
st_assem 0
st_senate 0
nta 0
nta_name 0
boro_ct 0
state 0
latitude 0
longitude 0
x_sp 0
y_sp 0
council district 6519
census tract 6519
bin 9559
bbl 9559
dtype: int64
# Pie chart of the per-column missing-value counts held in `nans`.
# Columns with fewer than 10,000 missing values are listed in
# `hiddenlabels` so only the larger counts are drawn by default.
column_names = list(nans.index)
missing_counts = list(nans.values)
fig = px.pie(names=column_names, values=missing_counts)
fig.update_layout(
    height=600,
    hiddenlabels=list(nans[nans < 10_000].index),
    title="Amount of missed values in each column (if more than 10000)",
)
fig.show()
3. Pairplot
# Draw seaborn's pairwise-relationship grid for `data`.
# NOTE(review): this is called on the full frame (data.shape above reports
# 683788 rows), which may be slow — consider plotting a sample; confirm.
sns.pairplot(data);
4. Correlation matrix between numeric features
data.corr(method="pearson", numeric_only=True)
| | tree_id | block_id | tree_dbh | stump_diam | postcode | community board | borocode | cncldist | st_assem | st_senate | boro_ct | latitude | longitude | x_sp | y_sp | council district | census tract | bin | bbl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| tree_id | 1.000000 | 0.114649 | 0.098413 | 0.012563 | 0.199660 | 0.235266 | 0.226142 | 0.149115 | -0.209727 | -0.231528 | 0.229696 | -0.131784 | 0.137696 | 0.137675 | -0.131558 | 0.146893 | 0.046812 | 0.237478 | 0.231893 |
| block_id | 0.114649 | 1.000000 | 0.002445 | 0.003633 | -0.074176 | 0.359408 | 0.364807 | 0.063857 | 0.206664 | 0.193345 | 0.358776 | 0.142741 | -0.009611 | -0.010409 | 0.143108 | 0.059914 | 0.078268 | 0.342150 | 0.363312 |
| tree_dbh | 0.098413 | 0.002445 | 1.000000 | -0.169963 | 0.099781 | 0.100037 | 0.094372 | 0.057056 | -0.144429 | -0.133569 | 0.097330 | -0.034252 | 0.093162 | 0.093166 | -0.034172 | 0.057132 | 0.036964 | 0.097007 | 0.095945 |
| stump_diam | 0.012563 | 0.003633 | -0.169963 | 1.000000 | 0.040682 | 0.022461 | 0.020702 | 0.004629 | -0.046938 | -0.040944 | 0.021716 | -0.004690 | 0.036701 | 0.036707 | -0.004663 | 0.004276 | 0.007326 | 0.022184 | 0.021428 |
| postcode | 0.199660 | -0.074176 | 0.099781 | 0.040682 | 1.000000 | 0.324896 | 0.309710 | 0.169539 | -0.674876 | -0.648751 | 0.323152 | -0.106493 | 0.492576 | 0.492905 | -0.106246 | 0.166506 | 0.114339 | 0.340281 | 0.315693 |
| community board | 0.235266 | 0.359408 | 0.100037 | 0.022461 | 0.324896 | 1.000000 | 0.999230 | 0.679708 | -0.543515 | -0.593410 | 0.999150 | -0.627760 | -0.171519 | -0.171635 | -0.627284 | 0.677678 | 0.176613 | 0.996748 | 0.999631 |
| borocode | 0.226142 | 0.364807 | 0.094372 | 0.020702 | 0.309710 | 0.999230 | 1.000000 | 0.676715 | -0.529001 | -0.580983 | 0.999268 | -0.624571 | -0.191089 | -0.191214 | -0.624108 | 0.674621 | 0.169195 | 0.995559 | 0.999541 |
| cncldist | 0.149115 | 0.063857 | 0.057056 | 0.004629 | 0.169539 | 0.679708 | 0.676715 | 1.000000 | -0.139636 | -0.194772 | 0.670272 | -0.885264 | -0.554068 | -0.553739 | -0.885337 | 0.999771 | -0.018487 | 0.670717 | 0.672331 |
| st_assem | -0.209727 | 0.206664 | -0.144429 | -0.046938 | -0.674876 | -0.543515 | -0.529001 | -0.139636 | 1.000000 | 0.932196 | -0.544253 | 0.223407 | -0.489973 | -0.490305 | 0.222944 | -0.137477 | -0.239845 | -0.562343 | -0.537014 |
| st_senate | -0.231528 | 0.193345 | -0.133569 | -0.040944 | -0.648751 | -0.593410 | -0.580983 | -0.194772 | 0.932196 | 1.000000 | -0.595534 | 0.278002 | -0.439158 | -0.439504 | 0.277535 | -0.192079 | -0.224303 | -0.611019 | -0.587548 |
| boro_ct | 0.229696 | 0.358776 | 0.097330 | 0.021716 | 0.323152 | 0.999150 | 0.999268 | 0.670272 | -0.544253 | -0.595534 | 1.000000 | -0.617911 | -0.169691 | -0.169818 | -0.617434 | 0.668110 | 0.183375 | 0.996119 | 0.999326 |
| latitude | -0.131784 | 0.142741 | -0.034252 | -0.004690 | -0.106493 | -0.627760 | -0.624571 | -0.885264 | 0.223407 | 0.278002 | -0.617911 | 1.000000 | 0.572289 | 0.571812 | 0.999999 | -0.886017 | -0.014834 | -0.628894 | -0.626599 |
| longitude | 0.137696 | -0.009611 | 0.093162 | 0.036701 | 0.492576 | -0.171519 | -0.191089 | -0.554068 | -0.489973 | -0.439158 | -0.169691 | 0.572289 | 1.000000 | 0.999999 | 0.572757 | -0.557000 | 0.220531 | -0.152223 | -0.178745 |
| x_sp | 0.137675 | -0.010409 | 0.093166 | 0.036707 | 0.492905 | -0.171635 | -0.191214 | -0.553739 | -0.490305 | -0.439504 | -0.169818 | 0.571812 | 0.999999 | 1.000000 | 0.572280 | -0.556669 | 0.220356 | -0.152314 | -0.178864 |
| y_sp | -0.131558 | 0.143108 | -0.034172 | -0.004663 | -0.106246 | -0.627284 | -0.624108 | -0.885337 | 0.222944 | 0.277535 | -0.617434 | 0.999999 | 0.572757 | 0.572280 | 1.000000 | -0.886092 | -0.014404 | -0.628408 | -0.626121 |
| council district | 0.146893 | 0.059914 | 0.057132 | 0.004276 | 0.166506 | 0.677678 | 0.674621 | 0.999771 | -0.137477 | -0.192079 | 0.668110 | -0.886017 | -0.557000 | -0.556669 | -0.886092 | 1.000000 | -0.018348 | 0.670766 | 0.672369 |
| census tract | 0.046812 | 0.078268 | 0.036964 | 0.007326 | 0.114339 | 0.176613 | 0.169195 | -0.018487 | -0.239845 | -0.224303 | 0.183375 | -0.014834 | 0.220531 | 0.220356 | -0.014404 | -0.018348 | 1.000000 | 0.183009 | 0.174843 |
| bin | 0.237478 | 0.342150 | 0.097007 | 0.022184 | 0.340281 | 0.996748 | 0.995559 | 0.670717 | -0.562343 | -0.611019 | 0.996119 | -0.628894 | -0.152223 | -0.152314 | -0.628408 | 0.670766 | 0.183009 | 1.000000 | 0.996809 |
| bbl | 0.231893 | 0.363312 | 0.095945 | 0.021428 | 0.315693 | 0.999631 | 0.999541 | 0.672331 | -0.537014 | -0.587548 | 0.999326 | -0.626599 | -0.178745 | -0.178864 | -0.626121 | 0.672369 | 0.174843 | 0.996809 | 1.000000 |
5. Tree distribution on the map of New York
# Build a GeoDataFrame with one point geometry per row, taken from the
# longitude/latitude columns and declared as EPSG:4326.
gdf = gpd.GeoDataFrame(
    data,
    geometry=gpd.points_from_xy(data.longitude, data.latitude),
    crs="EPSG:4326",
)

# Plot every record as a tiny marker, then add a basemap to the same axes.
ax = gdf.plot(markersize=0.1, figsize=(8, 8))
ax.set_title("Distribution of trees in New York")
# Pass the frame's CRS so the basemap is drawn in the same coordinate
# system as the plotted points.
cx.add_basemap(ax, crs=gdf.crs)