# standard library
import copy
from math import sqrt
from typing import Callable, Dict, List, Optional, Union
# 3rd-party
import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd
import xarray as xr
from dask import delayed
from xarray import DataArray
try:
import cupy
except ImportError:
class cupy(object):
ndarray = False
# local modules
from xrspatial.utils import ArrayTypeFunctionMapping, ngjit, not_implemented_func, validate_arrays
TOTAL_COUNT = '_total_count'
def _stats_count(data):
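    """
    Backend-dependent cell count used for the ``count`` statistic.
    A minimal illustrative example for the numpy branch, where masked
    entries are excluded from the count:

    >>> _stats_count(np.ma.masked_array([1, 2, 3, 4], mask=[0, 1, 0, 0]))
    3
    """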
if isinstance(data, np.ndarray):
# numpy case
stats_count = np.ma.count(data)
elif isinstance(data, cupy.ndarray):
# cupy case
stats_count = np.prod(data.shape)
else:
# dask case
stats_count = data.size - da.ma.getmaskarray(data).sum()
return stats_count
_DEFAULT_STATS = dict(
mean=lambda z: z.mean(),
max=lambda z: z.max(),
min=lambda z: z.min(),
sum=lambda z: z.sum(),
std=lambda z: z.std(),
var=lambda z: z.var(),
count=lambda z: _stats_count(z),
)
_DASK_BLOCK_STATS = dict(
max=lambda z: z.max(),
min=lambda z: z.min(),
sum=lambda z: z.sum(),
count=lambda z: _stats_count(z),
sum_squares=lambda z: (z**2).sum()
)
_DASK_STATS = dict(
max=lambda block_maxes: np.nanmax(block_maxes, axis=0),
min=lambda block_mins: np.nanmin(block_mins, axis=0),
sum=lambda block_sums: np.nansum(block_sums, axis=0),
count=lambda block_counts: np.nansum(block_counts, axis=0),
sum_squares=lambda block_sum_squares: np.nansum(block_sum_squares, axis=0),
squared_sum=lambda block_sums: np.nansum(block_sums, axis=0)**2,
)
def _dask_mean(sums, counts): return sums / counts # noqa
def _dask_std(sum_squares, squared_sum, n): return np.sqrt((sum_squares - squared_sum/n) / n) # noqa
def _dask_var(sum_squares, squared_sum, n): return (sum_squares - squared_sum/n) / n # noqa
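# The three helpers above combine per-block partial results into global
# statistics using the identity var(z) = (sum(z**2) - sum(z)**2 / n) / n
# (population variance, matching numpy's default ddof=0). An illustrative
# check, written as comments so it is not executed at import time:
#
#     z = np.array([1., 2., 3., 4.])            # n = 4, sum = 10
#     _dask_mean(z.sum(), 4)                    # 2.5    == z.mean()
#     _dask_var((z**2).sum(), z.sum()**2, 4)    # 1.25   == z.var()
#     _dask_std((z**2).sum(), z.sum()**2, 4)    # ~1.118 == z.std()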
@ngjit
def _strides(flatten_zones, unique_zones):
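    """
    Find the end offset (exclusive) of each zone inside a sorted,
    flattened zones array. A minimal illustrative example, assuming
    `flatten_zones` is already sorted:

    >>> _strides(np.array([0, 0, 1, 1, 1, 2]), np.array([0, 1, 2]))
    array([2, 5, 6], dtype=int32)
    """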
num_elements = flatten_zones.shape[0]
num_zones = len(unique_zones)
strides = np.zeros(len(unique_zones), dtype=np.int32)
count = 0
for i in range(num_zones):
while (count < num_elements) and (
flatten_zones[count] == unique_zones[i]):
count += 1
strides[i] = count
return strides
def _sort_and_stride(zones, values, unique_zones):
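    """
    Sort the flattened `zones`, reorder the flattened `values` the same
    way, and compute per-zone end offsets via `_strides`. A minimal
    illustrative 2D example with two zones of two cells each:

    >>> zones = np.array([[1., 0.], [0., 1.]])
    >>> values = np.array([[1., 20.], [30., 400.]])
    >>> _, vals, breaks = _sort_and_stride(zones, values, np.array([0., 1.]))
    >>> breaks
    array([2, 4], dtype=int32)
    >>> float(vals[:2].sum()), float(vals[2:].sum())  # zone 0, zone 1
    (50.0, 401.0)
    """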
flatten_zones = zones.ravel()
sorted_indices = np.argsort(flatten_zones)
sorted_zones = flatten_zones[sorted_indices]
values_shape = values.shape
if len(values_shape) == 3:
values_by_zones = copy.deepcopy(values).reshape(
values_shape[0], values_shape[1] * values_shape[2])
for i in range(values_shape[0]):
values_by_zones[i] = values_by_zones[i][sorted_indices]
else:
values_by_zones = values.ravel()[sorted_indices]
    # exclude nans from calculation;
    # sorted_zones is sorted, so NaN elements (if any) are at the end
    # of the array, and removing them will not affect the data before them
sorted_zones = sorted_zones[np.isfinite(sorted_zones)]
zone_breaks = _strides(sorted_zones, unique_zones)
return sorted_indices, values_by_zones, zone_breaks
def _calc_stats(
    values_by_zones: np.ndarray,
    zone_breaks: np.ndarray,
    unique_zones: np.ndarray,
    zone_ids: np.ndarray,
    func: Callable,
    nodata_values: Optional[Union[int, float]] = None,
):
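    """
    Apply `func` to each selected zone's values, using the layout
    produced by `_sort_and_stride`. An illustrative example with two
    zones of two cells each:

    >>> values_by_zones = np.array([20., 30., 1., 400.])
    >>> zone_breaks = np.array([2, 4])
    >>> unique_zones = np.array([0., 1.])
    >>> _calc_stats(values_by_zones, zone_breaks, unique_zones,
    ...             unique_zones, np.sum)
    array([ 50., 401.])
    """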
start = 0
results = np.full(unique_zones.shape, np.nan)
for i in range(len(unique_zones)):
end = zone_breaks[i]
if unique_zones[i] in zone_ids:
zone_values = values_by_zones[start:end]
# filter out non-finite and nodata_values
zone_values = zone_values[np.isfinite(zone_values) & (zone_values != nodata_values)]
if len(zone_values) > 0:
results[i] = func(zone_values)
start = end
return results
@delayed
def _single_stats_func(
    zones_block: np.ndarray,
    values_block: np.ndarray,
    unique_zones: np.ndarray,
    zone_ids: np.ndarray,
    func: Callable,
    nodata_values: Optional[Union[int, float]] = None,
) -> np.ndarray:
_, values_by_zones, zone_breaks = _sort_and_stride(zones_block, values_block, unique_zones)
results = _calc_stats(values_by_zones, zone_breaks, unique_zones, zone_ids, func, nodata_values)
return results
def _stats_dask_numpy(
zones: da.Array,
values: da.Array,
zone_ids: List[Union[int, float]],
stats_funcs: Dict,
nodata_values: Union[int, float],
) -> dd.DataFrame:
# find ids for all zones
unique_zones = np.unique(zones[np.isfinite(zones)])
select_all_zones = False
    # select zones to do analysis
if zone_ids is None:
zone_ids = unique_zones
select_all_zones = True
zones_blocks = zones.to_delayed().ravel()
values_blocks = values.to_delayed().ravel()
stats_dict = {}
stats_dict["zone"] = unique_zones # zone column
compute_sum_squares = False
compute_sum = False
compute_count = False
    if 'mean' in stats_funcs or 'std' in stats_funcs or 'var' in stats_funcs:
        compute_sum = True
        compute_count = True
    if 'std' in stats_funcs or 'var' in stats_funcs:
        compute_sum_squares = True
basis_stats = [s for s in _DASK_BLOCK_STATS if s in stats_funcs]
if compute_count and 'count' not in basis_stats:
basis_stats.append('count')
if compute_sum and 'sum' not in basis_stats:
basis_stats.append('sum')
if compute_sum_squares:
basis_stats.append('sum_squares')
dask_dtypes = dict(
max=values.dtype,
min=values.dtype,
sum=values.dtype,
count=np.int64,
sum_squares=values.dtype,
squared_sum=values.dtype,
)
for s in basis_stats:
if s == 'sum_squares' and not compute_sum_squares:
continue
stats_func = _DASK_BLOCK_STATS.get(s)
stats_by_block = [
da.from_delayed(
                _single_stats_func(
z, v, unique_zones, zone_ids, stats_func, nodata_values
), shape=(np.nan,), dtype=dask_dtypes[s]
)
for z, v in zip(zones_blocks, values_blocks)
]
zonal_stats = da.stack(stats_by_block, allow_unknown_chunksizes=True)
stats_func_by_block = delayed(_DASK_STATS[s])
stats_dict[s] = da.from_delayed(
stats_func_by_block(zonal_stats), shape=(np.nan,), dtype=np.float64
)
if 'mean' in stats_funcs:
stats_dict['mean'] = _dask_mean(stats_dict['sum'], stats_dict['count'])
if 'std' in stats_funcs:
stats_dict['std'] = _dask_std(
stats_dict['sum_squares'], stats_dict['sum'] ** 2, stats_dict['count']
)
if 'var' in stats_funcs:
stats_dict['var'] = _dask_var(
stats_dict['sum_squares'], stats_dict['sum'] ** 2, stats_dict['count']
)
# generate dask dataframe
stats_df = dd.concat([dd.from_dask_array(s) for s in stats_dict.values()], axis=1)
# name columns
stats_df.columns = stats_dict.keys()
# select columns
stats_df = stats_df[['zone'] + list(stats_funcs.keys())]
    if not select_all_zones:
        # only return zones specified in `zone_ids`
        stats_df = stats_df[stats_df['zone'].isin(zone_ids)]
return stats_df
def _stats_numpy(
    zones: np.ndarray,
    values: np.ndarray,
zone_ids: List[Union[int, float]],
stats_funcs: Dict,
nodata_values: Union[int, float],
return_type: str,
) -> Union[pd.DataFrame, np.ndarray]:
# find ids for all zones
unique_zones = np.unique(zones[np.isfinite(zones)])
# selected zones to do analysis
if zone_ids is None:
zone_ids = unique_zones
else:
zone_ids = np.unique(zone_ids)
# remove zones that do not exist in `zones` raster
zone_ids = [z for z in zone_ids if z in unique_zones]
sorted_indices, values_by_zones, zone_breaks = _sort_and_stride(zones, values, unique_zones)
if return_type == 'pandas.DataFrame':
stats_dict = {}
stats_dict["zone"] = zone_ids
selected_indexes = [i for i, z in enumerate(unique_zones) if z in zone_ids]
for stats in stats_funcs:
func = stats_funcs.get(stats)
stats_dict[stats] = _calc_stats(
values_by_zones, zone_breaks,
unique_zones, zone_ids, func, nodata_values
)
stats_dict[stats] = stats_dict[stats][selected_indexes]
result = pd.DataFrame(stats_dict)
else:
result = np.full((len(stats_funcs), values.size), np.nan)
zone_ids_map = {z: i for i, z in enumerate(unique_zones) if z in zone_ids}
stats_id = 0
for stats in stats_funcs:
func = stats_funcs.get(stats)
stats_results = _calc_stats(
values_by_zones, zone_breaks,
unique_zones, zone_ids, func, nodata_values
)
for zone in zone_ids:
iz = zone_ids_map[zone] # position of zone in unique_zones
if iz == 0:
zs = sorted_indices[: zone_breaks[iz]]
else:
zs = sorted_indices[zone_breaks[iz-1]: zone_breaks[iz]]
result[stats_id][zs] = stats_results[iz]
stats_id += 1
result = result.reshape(len(stats_funcs), *values.shape)
return result
def _stats_cupy(
    orig_zones: cupy.ndarray,
    orig_values: cupy.ndarray,
zone_ids: List[Union[int, float]],
stats_funcs: Dict,
nodata_values: Union[int, float],
) -> pd.DataFrame:
# TODO add support for 3D input
if len(orig_values.shape) > 2:
raise TypeError('3D inputs not supported for cupy backend')
zones = cupy.ravel(orig_zones)
values = cupy.ravel(orig_values)
sorted_indices = cupy.argsort(zones)
sorted_zones = zones[sorted_indices]
values_by_zone = values[sorted_indices]
# filter out values that are non-finite or values equal to nodata_values
    if nodata_values is not None:
filter_values = cupy.isfinite(values_by_zone) & (
values_by_zone != nodata_values)
else:
filter_values = cupy.isfinite(values_by_zone)
values_by_zone = values_by_zone[filter_values]
sorted_zones = sorted_zones[filter_values]
    # find the unique zones and the zone breaks
unique_zones, unique_index, unique_counts = cupy.unique(
sorted_zones, return_index=True, return_counts=True)
# Transfer to the host
unique_index = unique_index.get()
unique_counts = unique_counts.get()
unique_zones = unique_zones.get()
if zone_ids is not None:
# We need to extract the index and element count
# only for the elements in zone_ids
unique_index_lst = []
unique_counts_lst = []
unique_zones = list(unique_zones)
        found_zones = []
        for z in zone_ids:
            try:
                idx = unique_zones.index(z)
                unique_index_lst.append(unique_index[idx])
                unique_counts_lst.append(unique_counts[idx])
                found_zones.append(z)
            except ValueError:
                continue
        # keep only the zones that actually exist in `zones`, so the
        # index and count lists stay aligned with the zone list
        unique_zones = found_zones
unique_counts = unique_counts_lst
unique_index = unique_index_lst
# stats columns
stats_dict = {'zone': []}
for stats in stats_funcs:
stats_dict[stats] = []
for i in range(len(unique_zones)):
zone_id = unique_zones[i]
        # skip non-finite zone ids
if not np.isfinite(zone_id):
continue
stats_dict['zone'].append(zone_id)
# extract zone_values
zone_values = values_by_zone[unique_index[i]:unique_index[i]+unique_counts[i]]
# apply stats on the zone data
for j, stats in enumerate(stats_funcs):
stats_func = stats_funcs.get(stats)
if not callable(stats_func):
raise ValueError(stats)
result = stats_func(zone_values)
            assert len(result.shape) == 0
stats_dict[stats].append(cupy.float_(result))
    stats_df = pd.DataFrame(stats_dict)
    return stats_df
def stats(
zones: xr.DataArray,
values: xr.DataArray,
zone_ids: Optional[List[Union[int, float]]] = None,
stats_funcs: Union[Dict, List] = [
"mean",
"max",
"min",
"sum",
"std",
"var",
"count",
],
    nodata_values: Optional[Union[int, float]] = None,
return_type: str = 'pandas.DataFrame',
) -> Union[pd.DataFrame, dd.DataFrame, xr.DataArray]:
"""
Calculate summary statistics for each zone defined by a `zones`
dataset, based on `values` aggregate.
A single output value is computed for every zone in the input `zones`
dataset.
    This function currently supports numpy backed and dask with numpy backed
    xarray DataArrays.
Parameters
----------
zones : xr.DataArray
zones is a 2D xarray DataArray of numeric values.
A zone is all the cells in a raster that have the same value,
whether or not they are contiguous. The input `zones` raster defines
the shape, values, and locations of the zones. An integer field
in the input `zones` DataArray defines a zone.
values : xr.DataArray
values is a 2D xarray DataArray of numeric values (integers or floats).
The input `values` raster contains the input values used in
        calculating the output statistic for each zone. In the dask case,
        the chunksizes of `zones` and `values` should match. If not,
        `values` will be rechunked to match `zones`.
zone_ids : list of ints, or floats
List of zones to be included in calculation. If no zone_ids provided,
all zones will be used.
stats_funcs : dict, or list of strings, default=['mean', 'max', 'min',
'sum', 'std', 'var', 'count']
The statistics to calculate for each zone. If a list, possible
choices are subsets of the default options.
In the dictionary case, all of its values must be
callable. Function takes only one argument that is the `values` raster.
        The key becomes the column name in the output DataFrame.
Note that if `zones` and `values` are dask backed DataArrays,
`stats_funcs` must be provided as a list that is a subset of
default supported stats.
nodata_values: int, float, default=None
Nodata value in `values` raster.
Cells with `nodata_values` do not belong to any zone,
        and thus are excluded from calculation.
return_type: str, default='pandas.DataFrame'
        Format of returned data. If `zones` and `values` are numpy-backed
        xarray DataArrays, allowed values are 'pandas.DataFrame' and
        'xarray.DataArray'. Otherwise, only 'pandas.DataFrame' is supported.
Returns
-------
    stats_df : Union[pandas.DataFrame, dask.dataframe.DataFrame, xarray.DataArray]
        A pandas DataFrame, or a dask DataFrame, where each column
        is a statistic and each row is a zone with zone id. If
        `return_type` is 'xarray.DataArray', a 3D DataArray with one
        2D layer per statistic.
Examples
--------
stats() works with NumPy backed DataArray
.. sourcecode:: python
>>> import numpy as np
>>> import xarray as xr
>>> from xrspatial.zonal import stats
>>> height, width = 10, 10
>>> values_data = np.array([
[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
[40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
[50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
[70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
[80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
[90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])
>>> values = xr.DataArray(values_data)
>>> zones_data = np.array([
[ 0., 0., 0., 0., 0., 10., 10., 10., 10., 10.],
[ 0., 0., 0., 0., 0., 10., 10., 10., 10., 10.],
[ 0., 0., 0., 0., 0., 10., 10., 10., 10., 10.],
[ 0., 0., 0., 0., 0., 10., 10., 10., 10., 10.],
[ 0., 0., 0., 0., 0., 10., 10., 10., 10., 10.],
[20., 20., 20., 20., 20., 30., 30., 30., 30., 30.],
[20., 20., 20., 20., 20., 30., 30., 30., 30., 30.],
[20., 20., 20., 20., 20., 30., 30., 30., 30., 30.],
[20., 20., 20., 20., 20., 30., 30., 30., 30., 30.],
[20., 20., 20., 20., 20., 30., 30., 30., 30., 30.]])
>>> zones = xr.DataArray(zones_data)
>>> # Calculate Stats
>>> stats_df = stats(zones=zones, values=values)
>>> print(stats_df)
zone mean max min sum std var count
0 0 22.0 44 0 550 14.21267 202.0 25
1 10 27.0 49 5 675 14.21267 202.0 25
2 20 72.0 94 50 1800 14.21267 202.0 25
3 30 77.0 99 55 1925 14.21267 202.0 25
>>> # Custom Stats
    >>> custom_stats = {'double_sum': lambda val: val.sum()*2}
>>> custom_stats_df = stats(zones=zones,
values=values,
stats_funcs=custom_stats)
>>> print(custom_stats_df)
zone double_sum
0 0 1100
1 10 1350
2 20 3600
3 30 3850
    stats() works with Dask with NumPy backed DataArray
    >>> import dask.array as da
>>> values_dask = xr.DataArray(da.from_array(values_data, chunks=(3, 3)))
>>> zones_dask = xr.DataArray(da.from_array(zones_data, chunks=(3, 3)))
>>> # Calculate Stats with dask backed xarray DataArrays
>>> dask_stats_df = stats(zones=zones_dask, values=values_dask)
>>> print(type(dask_stats_df))
<class 'dask.dataframe.core.DataFrame'>
>>> print(dask_stats_df.compute())
zone mean max min sum std var count
0 0 22.0 44 0 550 14.21267 202.0 25
1 10 27.0 49 5 675 14.21267 202.0 25
2 20 72.0 94 50 1800 14.21267 202.0 25
3 30 77.0 99 55 1925 14.21267 202.0 25
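
    stats() can also return the results as a 3D xarray DataArray with one
    2D layer per statistic, where every cell of a layer holds its zone's
    statistic (illustrative of the `return_type='xarray.DataArray'` path):

    .. sourcecode:: python

        >>> stats_agg = stats(
        ...     zones=zones, values=values, return_type='xarray.DataArray')
        >>> stats_agg.dims
        ('stats', 'dim_0', 'dim_1')
        >>> stats_agg.shape
        (7, 10, 10)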
"""
validate_arrays(zones, values)
if not (
issubclass(zones.data.dtype.type, np.integer)
or issubclass(zones.data.dtype.type, np.floating)
):
raise ValueError("`zones` must be an array of integers or floats.")
if not (
issubclass(values.data.dtype.type, np.integer)
or issubclass(values.data.dtype.type, np.floating)
):
raise ValueError("`values` must be an array of integers or floats.")
# validate stats_funcs
if isinstance(values.data, da.Array) and not isinstance(stats_funcs, list):
raise ValueError(
"Got dask-backed DataArray as `values` aggregate. "
"`stats_funcs` must be a subset of default supported stats "
"`[\'mean\', \'max\', \'min\', \'sum\', \'std\', \'var\', \'count\']`"
)
if isinstance(stats_funcs, list):
# create a dict of stats
stats_funcs_dict = {}
for stats in stats_funcs:
func = _DEFAULT_STATS.get(stats, None)
if func is None:
err_str = f"Invalid stat name. {stats} option not supported."
raise ValueError(err_str)
stats_funcs_dict[stats] = func
    elif isinstance(stats_funcs, dict):
        stats_funcs_dict = stats_funcs.copy()
    else:
        raise TypeError(
            "`stats_funcs` must be either a list of stat names "
            "or a dict of callables."
        )
mapper = ArrayTypeFunctionMapping(
numpy_func=lambda *args: _stats_numpy(*args, return_type=return_type),
dask_func=_stats_dask_numpy,
cupy_func=_stats_cupy,
dask_cupy_func=lambda *args: not_implemented_func(
*args, messages='stats() does not support dask with cupy backed DataArray' # noqa
),
)
result = mapper(values)(
zones.data, values.data, zone_ids, stats_funcs_dict, nodata_values,
)
if return_type == 'xarray.DataArray':
return xr.DataArray(
result,
coords={'stats': list(stats_funcs_dict.keys()), **values.coords},
dims=('stats', *values.dims),
attrs=values.attrs
)
return result
def _find_cats(values, cat_ids, nodata_values):
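    """
    Find the category ids present in `values`. A minimal illustrative
    example for the 2D numpy case, with 0 as the nodata value:

    >>> vals = xr.DataArray(np.array([[1., 2.], [2., 0.]]))
    >>> _find_cats(vals, None, 0)
    (array([1., 2.]), array([1., 2.]))
    """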
if len(values.shape) == 2:
# 2D case
unique_cats = np.unique(values.data[
np.isfinite(values.data) & (values.data != nodata_values)
])
else:
# 3D case
unique_cats = values[values.dims[0]].data
if cat_ids is None:
cat_ids = unique_cats
else:
if isinstance(values.data, np.ndarray):
# remove cats that do not exist in `values` raster
cat_ids = [c for c in cat_ids if c in unique_cats]
else:
cat_ids = _select_ids(unique_cats, cat_ids)
return unique_cats, cat_ids
def _get_zone_values(values_by_zones, start, end):
    if len(values_by_zones.shape) == 1:
        # 1D flattened, i.e., original data is 2D
        return values_by_zones[start:end]
    else:
        # 2D flattened, i.e., original data is 3D
        return values_by_zones[:, start:end]
def _single_zone_crosstab_2d(
zone_values,
unique_cats,
cat_ids,
nodata_values,
crosstab_dict,
):
    # 1D flattened zone_values, i.e., original data is 2D
# filter out non-finite and nodata_values
zone_values = zone_values[
np.isfinite(zone_values) & (zone_values != nodata_values)
]
total_count = zone_values.shape[0]
crosstab_dict[TOTAL_COUNT].append(total_count)
sorted_zone_values = np.sort(zone_values)
zone_cat_breaks = _strides(sorted_zone_values, unique_cats)
cat_start = 0
for j, cat in enumerate(unique_cats):
if cat in cat_ids:
count = zone_cat_breaks[j] - cat_start
crosstab_dict[cat].append(count)
cat_start = zone_cat_breaks[j]
def _single_zone_crosstab_3d(
zone_values,
unique_cats,
cat_ids,
nodata_values,
crosstab_dict,
stats_func
):
    # 2D flattened `zone_values`, i.e., original data is 3D
for j, cat in enumerate(unique_cats):
if cat in cat_ids:
zone_cat_data = zone_values[j]
# filter out non-finite and nodata_values
zone_cat_data = zone_cat_data[
np.isfinite(zone_cat_data)
& (zone_cat_data != nodata_values)
]
crosstab_dict[cat].append(stats_func(zone_cat_data))
def _crosstab_numpy(
zones: np.ndarray,
values: np.ndarray,
zone_ids: List[Union[int, float]],
unique_cats: np.ndarray,
cat_ids: Union[List, np.ndarray],
nodata_values: Union[int, float],
agg: str,
) -> pd.DataFrame:
# find ids for all zones
unique_zones = np.unique(zones[np.isfinite(zones)])
# selected zones to do analysis
if zone_ids is None:
zone_ids = unique_zones
else:
# remove zones that do not exist in `zones` raster
zone_ids = [z for z in zone_ids if z in unique_zones]
crosstab_dict = {}
crosstab_dict["zone"] = zone_ids
if len(values.shape) == 2:
crosstab_dict[TOTAL_COUNT] = []
for cat in cat_ids:
crosstab_dict[cat] = []
_, values_by_zones, zone_breaks = _sort_and_stride(
zones, values, unique_zones
)
start = 0
for i in range(len(unique_zones)):
end = zone_breaks[i]
if unique_zones[i] in zone_ids:
# get data for zone unique_zones[i]
zone_values = _get_zone_values(values_by_zones, start, end)
if len(values.shape) == 2:
_single_zone_crosstab_2d(
zone_values, unique_cats, cat_ids, nodata_values, crosstab_dict
)
else:
_single_zone_crosstab_3d(
zone_values, unique_cats, cat_ids, nodata_values, crosstab_dict, _DEFAULT_STATS[agg] # noqa
)
start = end
if TOTAL_COUNT in crosstab_dict:
crosstab_dict[TOTAL_COUNT] = np.array(
crosstab_dict[TOTAL_COUNT], dtype=np.float32
)
for cat in cat_ids:
crosstab_dict[cat] = np.array(crosstab_dict[cat])
# construct output dataframe
if agg == 'percentage':
# replace 0s with nans to avoid dividing by 0 error
crosstab_dict[TOTAL_COUNT][crosstab_dict[TOTAL_COUNT] == 0] = np.nan
for cat in cat_ids:
crosstab_dict[cat] = crosstab_dict[cat] / crosstab_dict[TOTAL_COUNT] * 100 # noqa
crosstab_df = pd.DataFrame(crosstab_dict)
crosstab_df = crosstab_df[['zone'] + list(cat_ids)]
return crosstab_df
@delayed
def _single_chunk_crosstab(
zones_block: np.array,
values_block: np.array,
unique_zones: np.array,
zone_ids: np.array,
unique_cats,
cat_ids,
nodata_values: Union[int, float],
):
results = {}
if len(values_block.shape) == 2:
results[TOTAL_COUNT] = []
for cat in cat_ids:
results[cat] = []
_, values_by_zones, zone_breaks = _sort_and_stride(
zones_block, values_block, unique_zones
)
start = 0
for i in range(len(unique_zones)):
end = zone_breaks[i]
if unique_zones[i] in zone_ids:
# get data for zone unique_zones[i]
zone_values = _get_zone_values(values_by_zones, start, end)
if len(values_block.shape) == 2:
_single_zone_crosstab_2d(
zone_values, unique_cats, cat_ids, nodata_values, results
)
else:
_single_zone_crosstab_3d(
zone_values, unique_cats, cat_ids, nodata_values, results, _DEFAULT_STATS['count'] # noqa
)
start = end
if TOTAL_COUNT in results:
results[TOTAL_COUNT] = np.array(results[TOTAL_COUNT], dtype=np.float32)
for cat in cat_ids:
results[cat] = np.array(results[cat])
return results
@delayed
def _select_ids(unique_ids, ids):
selected_ids = []
for i in ids:
if i in unique_ids:
selected_ids.append(i)
return selected_ids
@delayed
def _crosstab_df_dask(crosstab_by_block, zone_ids, cat_ids, agg):
result = crosstab_by_block[0]
for i in range(1, len(crosstab_by_block)):
for k in crosstab_by_block[i]:
result[k] += crosstab_by_block[i][k]
if agg == 'percentage':
# replace 0s with nans to avoid dividing by 0 error
result[TOTAL_COUNT][result[TOTAL_COUNT] == 0] = np.nan
for cat in cat_ids:
result[cat] = result[cat] / result[TOTAL_COUNT] * 100
df = pd.DataFrame(result)
df['zone'] = zone_ids
columns = ['zone'] + list(cat_ids)
df = df[columns]
return df
def _crosstab_dask_numpy(
zones: np.ndarray,
values: np.ndarray,
zone_ids: List[Union[int, float]],
unique_cats: np.ndarray,
cat_ids: Union[List, np.ndarray],
nodata_values: Union[int, float],
agg: str,
):
# find ids for all zones
unique_zones = np.unique(zones[np.isfinite(zones)])
if zone_ids is None:
zone_ids = unique_zones
else:
zone_ids = _select_ids(unique_zones, zone_ids)
cat_ids = _select_ids(unique_cats, cat_ids)
zones_blocks = zones.to_delayed().ravel()
values_blocks = values.to_delayed().ravel()
crosstab_by_block = [
_single_chunk_crosstab(
z, v, unique_zones, zone_ids,
unique_cats, cat_ids, nodata_values
)
for z, v in zip(zones_blocks, values_blocks)
]
crosstab_df = _crosstab_df_dask(
crosstab_by_block, zone_ids, cat_ids, agg
)
return dd.from_delayed(crosstab_df)
def crosstab(
zones: xr.DataArray,
values: xr.DataArray,
    zone_ids: Optional[List[Union[int, float]]] = None,
    cat_ids: Optional[List[Union[int, float]]] = None,
layer: Optional[int] = None,
agg: Optional[str] = "count",
nodata_values: Optional[Union[int, float]] = None,
) -> Union[pd.DataFrame, dd.DataFrame]:
"""
Calculate cross-tabulated (categorical stats) areas
between two datasets: a zone dataset `zones`, a value dataset `values`
(a value raster). Infinite and NaN values in `zones` and `values` will
be ignored.
Outputs a pandas DataFrame if `zones` and `values` are numpy backed.
Outputs a dask DataFrame if `zones` and `values` are dask with
numpy-backed xarray DataArrays.
Requires a DataArray with a single data dimension, here called the
"values", indexed using either 2D or 3D coordinates.
DataArrays with 3D coordinates are expected to contain values
distributed over different categories that are indexed by the
additional coordinate. Such an array would reduce to the
2D-coordinate case if collapsed across the categories (e.g. if one
did ``aggc.sum(dim='cat')`` for a categorical dimension ``cat``).
Parameters
----------
zones : xr.DataArray
2D data array of integers or floats.
A zone is all the cells in a raster that have the same value,
whether or not they are contiguous. The input `zones` raster defines
        the shape, values, and locations of the zones. A unique field
        in the zone input is specified to define the zones.
values : xr.DataArray
2D or 3D data array of integers or floats.
The input value raster contains the input values used in
calculating the categorical statistic for each zone.
zone_ids: List of ints, or floats
List of zones to be included in calculation. If no zone_ids provided,
all zones will be used.
cat_ids: List of ints, or floats
List of categories to be included in calculation.
If no cat_ids provided, all categories will be used.
    layer: int, default=None
        Index of the categorical dimension layer inside the 3D `values`
        DataArray. If None, the first dimension (index 0) is used.
agg: str, default = 'count'
Aggregation method.
        If the `values` data is 2D, available options are `percentage` and `count`.
        If `values` is 3D and numpy-backed, available options are
        `min`, `max`, `mean`, `sum`, `std`, `var`, and `count`.
        If `values` is 3D and backed by dask with numpy, the only available option is `count`.
nodata_values: int, float, default=None
Nodata value in `values` raster.
        Cells with `nodata_values` do not belong to any zone,
        and thus are excluded from calculation.
Returns
-------
crosstab_df : Union[pandas.DataFrame, dask.dataframe.DataFrame]
A pandas DataFrame, or an uncomputed dask DataFrame,
where each column is a categorical value and each row is a zone
        with zone id. Each entry is the statistic of the category's values
        within the zone, computed using the specified aggregation method.
Examples
--------
crosstab() works with NumPy backed DataArray.
.. sourcecode:: python
>>> import numpy as np
>>> import xarray as xr
>>> from xrspatial.zonal import crosstab
>>> values_data = np.asarray([
[0, 0, 10, 20],
[0, 0, 0, 10],
[0, np.nan, 20, 50],
[10, 30, 40, np.inf],
[10, 10, 50, 0]])
>>> values = xr.DataArray(values_data)
>>> zones_data = np.asarray([
[1, 1, 6, 6],
[1, np.nan, 6, 6],
[3, 5, 6, 6],
[3, 5, 7, np.nan],
[3, 7, 7, 0]])
>>> zones = xr.DataArray(zones_data)
>>> # Calculate Crosstab, numpy case
>>> df = crosstab(zones=zones, values=values)
>>> print(df)
zone 0.0 10.0 20.0 30.0 40.0 50.0
0 0 1 0 0 0 0 0
1 1 3 0 0 0 0 0
2 3 1 2 0 0 0 0
3 5 0 0 0 1 0 0
4 6 1 2 2 0 0 1
5 7 0 1 0 0 1 1
crosstab() works with Dask with NumPy backed DataArray.
.. sourcecode:: python
>>> import dask.array as da
>>> values_dask = xr.DataArray(da.from_array(values_data, chunks=(3, 3)))
>>> zones_dask = xr.DataArray(da.from_array(zones_data, chunks=(3, 3)))
>>> dask_df = crosstab(zones=zones_dask, values=values_dask)
>>> print(dask_df)
Dask DataFrame Structure:
zone 0.0 10.0 20.0 30.0 40.0 50.0
npartitions=5
0 float64 int64 int64 int64 int64 int64 int64
1 ... ... ... ... ... ... ...
... ... ... ... ... ... ... ...
4 ... ... ... ... ... ... ...
5 ... ... ... ... ... ... ...
Dask Name: astype, 1186 tasks
>>> print(dask_df.compute())
zone 0.0 10.0 20.0 30.0 40.0 50.0
0 0 1 0 0 0 0 0
1 1 3 0 0 0 0 0
2 3 1 2 0 0 0 0
3 5 0 0 0 1 0 0
4 6 1 2 2 0 0 1
5 7 0 1 0 0 1 1
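
    With `agg='percentage'` (2D case), each entry is instead the category's
    share of the zone's finite, non-nodata cells. An illustrative check on
    zone 3 above, which has three such cells: one of category 0.0 and two
    of category 10.0:

    .. sourcecode:: python

        >>> pct_df = crosstab(zones=zones, values=values, agg='percentage')
        >>> row = pct_df.loc[pct_df['zone'] == 3]
        >>> round(float(row[10.0].iloc[0]), 2)
        66.67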
"""
if not isinstance(zones, xr.DataArray):
raise TypeError("zones must be instance of DataArray")
if not isinstance(values, xr.DataArray):
raise TypeError("values must be instance of DataArray")
if zones.ndim != 2:
raise ValueError("zones must be 2D")
if not (
issubclass(zones.data.dtype.type, np.integer)
or issubclass(zones.data.dtype.type, np.floating)
):
raise ValueError("`zones` must be an xarray of integers or floats")
if not issubclass(values.data.dtype.type, np.integer) and not issubclass(
values.data.dtype.type, np.floating
):
raise ValueError("`values` must be an xarray of integers or floats")
if values.ndim not in [2, 3]:
raise ValueError("`values` must use either 2D or 3D coordinates.")
agg_2d = ["percentage", "count"]
agg_3d_numpy = _DEFAULT_STATS.keys()
agg_3d_dask = ["count"]
if values.ndim == 2 and agg not in agg_2d:
raise ValueError(
f"`agg` method for 2D data array must be one of following {agg_2d}"
)
if values.ndim == 3:
if isinstance(values.data, np.ndarray) and agg not in agg_3d_numpy:
raise ValueError(
f"`agg` method for 3D numpy backed data array must be one of following {agg_3d_numpy}" # noqa
)
if isinstance(values.data, da.Array) and agg not in agg_3d_dask:
raise ValueError(
f"`agg` method for 3D dask backed data array must be one of following {agg_3d_dask}"
)
if len(values.shape) == 3:
# 3D case
if layer is None:
layer = 0
try:
values.indexes[values.dims[layer]].values
except (IndexError, KeyError):
raise ValueError("Invalid `layer`")
dims = values.dims
reshape_dims = [dims[layer]] + [d for d in dims if d != dims[layer]]
# transpose by that category dimension
values = values.transpose(*reshape_dims)
if zones.shape != values.shape[1:]:
raise ValueError("Incompatible shapes")
if isinstance(values.data, da.Array):
# dask case, rechunk if necessary
zones_chunks = zones.chunks
expected_values_chunks = {
0: (values.shape[0],),
1: zones_chunks[0],
2: zones_chunks[1],
}
actual_values_chunks = {
i: values.chunks[i] for i in range(3)
}
if actual_values_chunks != expected_values_chunks:
values.data = values.data.rechunk(expected_values_chunks)
# find categories
unique_cats, cat_ids = _find_cats(values, cat_ids, nodata_values)
mapper = ArrayTypeFunctionMapping(
numpy_func=_crosstab_numpy,
dask_func=_crosstab_dask_numpy,
cupy_func=lambda *args: not_implemented_func(
*args, messages='crosstab() does not support cupy backed DataArray'
),
dask_cupy_func=lambda *args: not_implemented_func(
*args, messages='crosstab() does not support dask with cupy backed DataArray' # noqa
),
)
crosstab_df = mapper(values)(
zones.data, values.data,
zone_ids, unique_cats, cat_ids, nodata_values, agg
)
return crosstab_df
def apply(
zones: xr.DataArray,
values: xr.DataArray,
func: Callable,
nodata: Optional[int] = 0
):
"""
    Apply a function to `values` within each zone of `zones`.
    The `values` aggregate is modified in place.
Parameters
----------
zones : xr.DataArray
zones.values is a 2d array of integers. A zone is all the cells
in a raster that have the same value, whether or not they are
contiguous. The input zone layer defines the shape, values, and
locations of the zones. An integer field in the zone input is
specified to define the zones.
    values : xr.DataArray
        values.values is either a 2D or 3D array of integers or floats.
        The input value raster.
    func : Callable
        Function to apply to `values` cells within each zone.
    nodata : int, default=0
        Nodata value in `zones` raster.
        Cells with `nodata` do not belong to any zone,
        and thus are excluded from calculation.
Examples
--------
.. sourcecode:: python
>>> import numpy as np
>>> import xarray as xr
>>> from xrspatial.zonal import apply
>>> zones_val = np.array([
[1, 1, 0, 2],
[0, 2, 1, 2]])
>>> zones = xr.DataArray(zones_val)
>>> values_val = np.array([
[2, -1, 5, 3],
[3, np.nan, 20, 10]])
>>> agg = xr.DataArray(values_val)
>>> func = lambda x: 0
>>> apply(zones, agg, func)
>>> agg
array([[0, 0, 5, 0],
[3, np.nan, 0, 0]])
"""
if not isinstance(zones, xr.DataArray):
raise TypeError("zones must be instance of DataArray")
if not isinstance(values, xr.DataArray):
raise TypeError("values must be instance of DataArray")
if zones.ndim != 2:
raise ValueError("zones must be 2D")
if values.ndim != 2 and values.ndim != 3:
raise ValueError("values must be either 2D or 3D coordinates")
if zones.shape != values.shape[:2]:
raise ValueError("Incompatible shapes between `zones` and `values`")
if not issubclass(zones.values.dtype.type, np.integer):
raise ValueError("`zones.values` must be an array of integers")
if not (
issubclass(values.values.dtype.type, np.integer)
or issubclass(values.values.dtype.type, np.floating)
):
raise ValueError("`values` must be an array of integers or float")
# entries of nodata remain the same
remain_entries = zones.data == nodata
    # entries to be included in calculation
zones_entries = zones.data != nodata
if len(values.shape) == 3:
z = values.shape[-1]
# add new z-dimension in case 3D `values` aggregate
remain_entries = np.repeat(
remain_entries[:, :, np.newaxis],
z,
axis=-1
)
zones_entries = np.repeat(
zones_entries[:, :, np.newaxis],
z,
axis=-1
)
remain_mask = np.ma.masked_array(values.data, mask=remain_entries)
zones_mask = np.ma.masked_array(values.data, mask=zones_entries)
# apply func to corresponding `values` of `zones`
vfunc = np.vectorize(func)
values_func = vfunc(zones_mask)
values.values = (
remain_mask.data
* remain_mask.mask
+ values_func.data
* values_func.mask
)
def get_full_extent(crs):
"""
Returns the full extent of a map projection, available projections
are 'Mercator' and 'Geographic'.
Parameters
----------
crs : str
Input projection.
Returns
-------
extent : tuple
Projection extent of form ((min_x, max_x), (min_y, max_y)).
Examples
--------
.. sourcecode:: python
>>> from xrspatial.zonal import get_full_extent
>>> extent = get_full_extent('Mercator')
>>> print(extent)
((-20000000.0, 20000000.0), (-20000000.0, 20000000.0))
"""
Mercator = (-20e6, 20e6), (-20e6, 20e6)
Geographic = (-180, 180), (-90, 90)
def _crs_code_mapping():
CRS_CODES = {}
CRS_CODES["Mercator"] = Mercator
CRS_CODES["Geographic"] = Geographic
return CRS_CODES
CRS_CODES = _crs_code_mapping()
return CRS_CODES[crs]
def suggest_zonal_canvas(
smallest_area: Union[int, float],
x_range: Union[tuple, list],
y_range: Union[tuple, list],
crs: str = "Mercator",
min_pixels: int = 25,
) -> tuple:
"""
Given a coordinate reference system (crs), a set of polygons with
corresponding x range and y range, calculate the height and width
of canvas so that the smallest polygon (polygon with smallest area)
is rasterized with at least min pixels.
    Currently, we assume that the smallest polygon does not intersect
    others. One should note that a polygon can have different shapes
    when it is rasterized in canvases of different sizes. Thus, we cannot
    100% guarantee the actual number of pixels after rasterization.
    It is recommended to add an additional 5% to the `min_pixels` parameter.
Parameters
----------
x_range : tuple or list of float or int
The full x extent of the polygon GeoDataFrame.
y_range : tuple or list of float or int
The full y extent of the polygon GeoDataFrame.
smallest_area : float or int
Area of the smallest polygon.
crs : str, default='Mercator'
Name of the coordinate reference system.
min_pixels : int, default=25
Expected number of pixels of the polygon with smallest area
when the whole dataframe is rasterized.
Returns
-------
height, width: int
Height and width of the canvas in pixel space.
Examples
--------
.. sourcecode:: python
>>> # Imports
>>> from spatialpandas import GeoDataFrame
>>> import geopandas as gpd
>>> import datashader as ds
>>> from xrspatial.zonal import suggest_zonal_canvas
>>> df = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
>>> df = df.to_crs("EPSG:3857")
>>> df = df[df.continent != 'Antarctica']
>>> df['id'] = [i for i in range(len(df.index))]
>>> xmin, ymin, xmax, ymax = (
df.bounds.minx.min(),
df.bounds.miny.min(),
df.bounds.maxx.max(),
df.bounds.maxy.max(),
)
>>> x_range = (xmin, xmax)
>>> y_range = (ymin, ymax)
>>> smallest_area = df.area.min()
>>> min_pixels = 20
>>> height, width = suggest_zonal_canvas(
x_range=x_range,
y_range=y_range,
smallest_area=smallest_area,
crs='Mercator',
min_pixels=min_pixels,
)
>>> height, width
(1537, 2376)
    >>> cvs = ds.Canvas(x_range=x_range, y_range=y_range,
                        plot_height=height, plot_width=width)
>>> spatial_df = GeoDataFrame(df, geometry='geometry')
>>> agg = cvs.polygons(spatial_df, 'geometry', agg=ds.max('id'))
>>> min_poly_id = df.area.argmin()
>>> actual_min_pixels = len(np.where(agg.data==min_poly_id)[0])
>>> actual_min_pixels
22
"""
full_xrange, full_yrange = get_full_extent(crs)
xmin, xmax = full_xrange
ymin, ymax = full_yrange
aspect_ratio = (xmax - xmin) / (ymax - ymin)
# area that a pixel stands for
pixel_area = smallest_area / min_pixels
# total_area of whole GeoDataFrame
total_area = (xmax - xmin) * (ymax - ymin)
# total pixels needed
total_pixels = total_area / pixel_area
# We have, h * w = total_pixels
# and, w / h = aspect_ratio
# Thus, aspect_ratio * h**2 = total_pixels
h = sqrt(total_pixels / aspect_ratio)
w = aspect_ratio * h
canvas_h = int(h * (y_range[1] - y_range[0]) / (ymax - ymin))
canvas_w = int(w * (x_range[1] - x_range[0]) / (xmax - xmin))
return canvas_h, canvas_w
@ngjit
def _area_connectivity(data, n=4):
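    """
    Assign a unique region id to each connected group of (approximately)
    equal-valued pixels. A minimal illustrative example with the default
    4-connectivity, where the two 7s in the last column touch but the
    lone 7 in the lower-left corner does not:

    >>> data = np.array([[5., 5., 7.],
    ...                  [7., 5., 7.]])
    >>> _area_connectivity(data)
    array([[1., 1., 2.],
           [3., 1., 2.]])
    """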
out = np.zeros_like(data)
rows, cols = data.shape
uid = 1
src_window = np.zeros(shape=(n,), dtype=data.dtype)
area_window = np.zeros(shape=(n,), dtype=data.dtype)
for y in range(0, rows):
for x in range(0, cols):
val = data[y, x]
if np.isnan(val):
out[y, x] = val
continue
if n == 8:
src_window[0] = data[max(y - 1, 0), max(x - 1, 0)]
src_window[1] = data[y, max(x - 1, 0)]
src_window[2] = data[min(y + 1, rows - 1), max(x - 1, 0)]
src_window[3] = data[max(y - 1, 0), x]
src_window[4] = data[min(y + 1, rows - 1), x]
src_window[5] = data[max(y - 1, 0), min(x + 1, cols - 1)]
src_window[6] = data[y, min(x + 1, cols - 1)]
src_window[7] = data[min(y + 1, rows - 1), min(x + 1, cols - 1)] # noqa
area_window[0] = out[max(y - 1, 0), max(x - 1, 0)]
area_window[1] = out[y, max(x - 1, 0)]
area_window[2] = out[min(y + 1, rows - 1), max(x - 1, 0)]
area_window[3] = out[max(y - 1, 0), x]
area_window[4] = out[min(y + 1, rows - 1), x]
area_window[5] = out[max(y - 1, 0), min(x + 1, cols - 1)]
area_window[6] = out[y, min(x + 1, cols - 1)]
area_window[7] = out[min(y + 1, rows - 1), min(x + 1, cols - 1)] # noqa
else:
src_window[0] = data[y, max(x - 1, 0)]
src_window[1] = data[max(y - 1, 0), x]
src_window[2] = data[min(y + 1, rows - 1), x]
src_window[3] = data[y, min(x + 1, cols - 1)]
area_window[0] = out[y, max(x - 1, 0)]
area_window[1] = out[max(y - 1, 0), x]
area_window[2] = out[min(y + 1, rows - 1), x]
area_window[3] = out[y, min(x + 1, cols - 1)]
            # check if the pixel has a matching value in its neighborhood
rtol = 1e-05
atol = 1e-08
is_close = np.abs(src_window - val) <= (atol + rtol * np.abs(val))
neighbor_matches = np.where(is_close)[0]
if len(neighbor_matches) > 0:
                # check if a region id has already been assigned
assigned_value = None
for j in range(len(neighbor_matches)):
area_val = area_window[neighbor_matches[j]]
if area_val > 0:
assigned_value = area_val
break
if assigned_value is not None:
out[y, x] = assigned_value
else:
out[y, x] = uid
uid += 1
else:
out[y, x] = uid
uid += 1
for y in range(0, rows):
for x in range(0, cols):
if n == 8:
src_window[0] = data[max(y - 1, 0), max(x - 1, 0)]
src_window[1] = data[y, max(x - 1, 0)]
src_window[2] = data[min(y + 1, rows - 1), max(x - 1, 0)]
src_window[3] = data[max(y - 1, 0), x]
src_window[4] = data[min(y + 1, rows - 1), x]
src_window[5] = data[max(y - 1, 0), min(x + 1, cols - 1)]
src_window[6] = data[y, min(x + 1, cols - 1)]
src_window[7] = data[min(y + 1, rows - 1), min(x + 1, cols - 1)] # noqa
area_window[0] = out[max(y - 1, 0), max(x - 1, 0)]
area_window[1] = out[y, max(x - 1, 0)]
area_window[2] = out[min(y + 1, rows - 1), max(x - 1, 0)]
area_window[3] = out[max(y - 1, 0), x]
area_window[4] = out[min(y + 1, rows - 1), x]
area_window[5] = out[max(y - 1, 0), min(x + 1, cols - 1)]
area_window[6] = out[y, min(x + 1, cols - 1)]
area_window[7] = out[min(y + 1, rows - 1), min(x + 1, cols - 1)] # noqa
else:
src_window[0] = data[y, max(x - 1, 0)]
src_window[1] = data[max(y - 1, 0), x]
src_window[2] = data[min(y + 1, rows - 1), x]
src_window[3] = data[y, min(x + 1, cols - 1)]
area_window[0] = out[y, max(x - 1, 0)]
area_window[1] = out[max(y - 1, 0), x]
area_window[2] = out[min(y + 1, rows - 1), x]
area_window[3] = out[y, min(x + 1, cols - 1)]
val = data[y, x]
if np.isnan(val):
continue
            # check if the pixel has a matching value in its neighborhood
rtol = 1e-05
atol = 1e-08
is_close = np.abs(src_window - val) <= (atol + rtol * np.abs(val))
neighbor_matches = np.where(is_close)[0]
            # check if a region id has already been assigned
assigned_values_min = None
for j in range(len(neighbor_matches)):
area_val = area_window[neighbor_matches[j]]
nn = assigned_values_min is not None
if nn and assigned_values_min != area_val:
if assigned_values_min > area_val:
# replace
for y1 in range(0, rows):
for x1 in range(0, cols):
if out[y1, x1] == assigned_values_min:
out[y1, x1] = area_val
assigned_values_min = area_val
else:
# replace
for y1 in range(0, rows):
for x1 in range(0, cols):
if out[y1, x1] == area_val:
out[y1, x1] = assigned_values_min
elif assigned_values_min is None:
assigned_values_min = area_val
return out
def regions(
raster: xr.DataArray, neighborhood: int = 4, name: str = "regions"
) -> xr.DataArray:
"""
Create unique regions of raster based on pixel value connectivity.
Connectivity can be based on either 4 or 8-pixel neighborhoods.
    The output raster contains a unique int for each connected region.
Parameters
----------
raster : xr.DataArray
    neighborhood : int, default=4
        4 or 8 pixel-based connectivity.
name: str, default='regions'
output xr.DataArray.name property.
Returns
-------
regions_agg : xarray.DataArray
References
----------
- Tomislav Hengl: http://spatial-analyst.net/ILWIS/htm/ilwisapp/areanumbering_algorithm.htm # noqa
Examples
--------
.. plot::
:include-source:
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
from xrspatial import generate_terrain
from xrspatial.zonal import regions
# Generate Example Terrain
W = 500
H = 300
template_terrain = xr.DataArray(np.zeros((H, W)))
x_range=(-20e6, 20e6)
y_range=(-20e6, 20e6)
terrain_agg = generate_terrain(
template_terrain, x_range=x_range, y_range=y_range
)
# Edit Attributes
terrain_agg = terrain_agg.assign_attrs(
{
'Description': 'Example Terrain',
'units': 'km',
'Max Elevation': '4000',
}
)
terrain_agg = terrain_agg.rename({'x': 'lon', 'y': 'lat'})
terrain_agg = terrain_agg.rename('Elevation')
# Create Regions
regions_agg = regions(terrain_agg)
# Edit Attributes
regions_agg = regions_agg.assign_attrs({'Description': 'Example Regions',
'units': ''})
regions_agg = regions_agg.rename('Region Value')
# Plot Terrain (Values)
terrain_agg.plot(cmap = 'terrain', aspect = 2, size = 4)
plt.title("Terrain (Values)")
plt.ylabel("latitude")
plt.xlabel("longitude")
# Plot Regions
regions_agg.plot(cmap = 'terrain', aspect = 2, size = 4)
plt.title("Regions")
plt.ylabel("latitude")
plt.xlabel("longitude")
.. sourcecode:: python
>>> print(terrain_agg[200:203, 200:202])
<xarray.DataArray 'Elevation' (lat: 3, lon: 2)>
array([[1264.02296597, 1261.947921 ],
[1285.37105519, 1282.48079719],
[1306.02339636, 1303.4069579 ]])
Coordinates:
* lon (lon) float64 -3.96e+06 -3.88e+06
* lat (lat) float64 6.733e+06 6.867e+06 7e+06
Attributes:
res: (80000.0, 133333.3333333333)
Description: Example Terrain
units: km
Max Elevation: 4000
>>> print(regions_agg[200:203, 200:202])
<xarray.DataArray 'Region Value' (lat: 3, lon: 2)>
array([[39557., 39558.],
[39943., 39944.],
[40327., 40328.]])
Coordinates:
* lon (lon) float64 -3.96e+06 -3.88e+06
* lat (lat) float64 6.733e+06 6.867e+06 7e+06
Attributes:
res: (80000.0, 133333.3333333333)
Description: Example Regions
units:
Max Elevation: 4000
"""
if neighborhood not in (4, 8):
raise ValueError("`neighborhood` value must be either 4 or 8)")
out = _area_connectivity(raster.data, n=neighborhood)
return DataArray(
out,
name=name,
dims=raster.dims,
coords=raster.coords,
attrs=raster.attrs
)
def _bool_crop(arr, rows_flags, cols_flags):
top = np.argwhere(rows_flags).flatten()[0]
bottom = np.argwhere(rows_flags).flatten()[-1]
left = np.argwhere(cols_flags).flatten()[0]
right = np.argwhere(cols_flags).flatten()[-1]
return arr[top: bottom + 1, left: right + 1]
@ngjit
def _trim(data, excludes):
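    """
    Find the bounding box (top, bottom, left, right row/col indices) of
    the data once edge rows/cols containing only `excludes` values are
    dropped. A minimal illustrative example:

    >>> data = np.array([[0, 0, 0],
    ...                  [0, 1, 0],
    ...                  [0, 0, 0]])
    >>> _trim(data, (0,))
    (1, 1, 1, 1)
    """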
rows, cols = data.shape
# find empty top rows
top = 0
scan_complete = False
for y in range(rows):
if scan_complete:
break
top = y
for x in range(cols):
val = data[y, x]
is_nodata = False
for e in excludes:
if e == val:
is_nodata = True
break
if not is_nodata:
scan_complete = True
break
# find empty bottom rows
bottom = 0
scan_complete = False
for y in range(rows - 1, -1, -1):
if scan_complete:
break
bottom = y
for x in range(cols):
val = data[y, x]
is_nodata = False
for e in excludes:
if e == val:
is_nodata = True
break
if not is_nodata:
scan_complete = True
break
# find empty left cols
left = 0
scan_complete = False
for x in range(cols):
if scan_complete:
break
left = x
for y in range(rows):
val = data[y, x]
is_nodata = False
for e in excludes:
if e == val:
is_nodata = True
break
if not is_nodata:
scan_complete = True
break
# find empty right cols
right = 0
scan_complete = False
for x in range(cols - 1, -1, -1):
if scan_complete:
break
right = x
for y in range(rows):
val = data[y, x]
is_nodata = False
for e in excludes:
if e == val:
is_nodata = True
break
if not is_nodata:
scan_complete = True
break
return top, bottom, left, right
def trim(
raster: xr.DataArray,
values: Union[list, tuple] = (np.nan,),
name: str = "trim"
) -> xr.DataArray:
"""
    Scan from the raster edges and eliminate rows / cols that contain
    only the values supplied.
Parameters
----------
raster: xr.DataArray
    values: list or tuple, default=(np.nan,)
        List of values to trim from the raster edges.
name: str, default='trim'
Output xr.DataArray.name property.
Returns
-------
trim_agg : xarray.DataArray
Notes
-----
- This operation will change the output size of the raster.
Examples
--------
.. plot::
:include-source:
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
from xrspatial import generate_terrain
from xrspatial.zonal import trim
# Generate Example Terrain
W = 500
H = 300
template_terrain = xr.DataArray(np.zeros((H, W)))
x_range=(-20e6, 20e6)
y_range=(-20e6, 20e6)
terrain_agg = generate_terrain(
template_terrain, x_range=x_range, y_range=y_range
)
# Edit Attributes
terrain_agg = terrain_agg.assign_attrs(
{
'Description': 'Example Terrain',
'units': 'km',
'Max Elevation': '4000',
}
)
terrain_agg = terrain_agg.rename({'x': 'lon', 'y': 'lat'})
terrain_agg = terrain_agg.rename('Elevation')
terrain_agg = terrain_agg.astype('int')
# Trim Image
trimmed_agg = trim(raster = terrain_agg, values = [0])
# Edit Attributes
trimmed_agg = trimmed_agg.assign_attrs({'Description': 'Example Trim'})
# Plot Terrain
terrain_agg.plot(cmap = 'terrain', aspect = 2, size = 4)
plt.title("Terrain")
plt.ylabel("latitude")
plt.xlabel("longitude")
# Plot Trimmed Terrain
trimmed_agg.plot(cmap = 'terrain', aspect = 2, size = 4)
plt.title("Trim")
plt.ylabel("latitude")
plt.xlabel("longitude")
.. sourcecode:: python
>>> print(terrain_agg.shape)
(300, 500)
>>> print(terrain_agg.attrs)
{
'res': (80000.0, 133333.3333333333),
'Description': 'Example Terrain',
'units': 'km',
'Max Elevation': '4000',
}
>>> print(trimmed_agg.shape)
(268, 500)
>>> print(trimmed_agg.attrs)
{
'res': (80000.0, 133333.3333333333),
'Description': 'Example Trim',
'units': 'km',
'Max Elevation': '4000',
}
"""
top, bottom, left, right = _trim(raster.data, values)
arr = raster[top: bottom + 1, left: right + 1]
arr.name = name
return arr
@ngjit
def _crop(data, values):
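    """
    Find the bounding box (top, bottom, left, right row/col indices) of
    the outermost rows/cols in which one of `values` appears. A minimal
    illustrative example:

    >>> data = np.array([[0, 5, 0],
    ...                  [0, 5, 0],
    ...                  [0, 0, 0]])
    >>> _crop(data, (5,))
    (0, 1, 1, 1)
    """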
rows, cols = data.shape
# find empty top rows
top = 0
scan_complete = False
for y in range(rows):
if scan_complete:
break
top = y
for x in range(cols):
val = data[y, x]
for v in values:
if v == val:
scan_complete = True
break
else:
continue
if scan_complete:
break
# find empty bottom rows
bottom = 0
scan_complete = False
for y in range(rows - 1, -1, -1):
if scan_complete:
break
bottom = y
for x in range(cols):
val = data[y, x]
for e in values:
if e == val:
scan_complete = True
break
else:
continue
if scan_complete:
break
# find empty left cols
left = 0
scan_complete = False
for x in range(cols):
if scan_complete:
break
left = x
for y in range(rows):
val = data[y, x]
for e in values:
if e == val:
scan_complete = True
break
else:
continue
if scan_complete:
break
# find empty right cols
right = 0
scan_complete = False
for x in range(cols - 1, -1, -1):
if scan_complete:
break
right = x
for y in range(rows):
val = data[y, x]
for e in values:
if e == val:
scan_complete = True
break
else:
continue
if scan_complete:
break
return top, bottom, left, right
def crop(
zones: xr.DataArray,
values: xr.DataArray,
zones_ids: Union[list, tuple],
name: str = "crop",
):
"""
    Scan from the raster edges and eliminate rows / cols until one of
    the input values is found.
Parameters
----------
zones : xr.DataArray
Input zone raster.
values: xr.DataArray
Input values raster.
zones_ids : list or tuple
List of zone ids to crop raster.
name: str, default='crop'
Output xr.DataArray.name property.
Returns
-------
crop_agg : xarray.DataArray
Notes
-----
- This operation will change the output size of the raster.
Examples
--------
.. plot::
:include-source:
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
from xrspatial import generate_terrain
from xrspatial.zonal import crop
# Generate Example Terrain
W = 500
H = 300
template_terrain = xr.DataArray(np.zeros((H, W)))
x_range=(-20e6, 20e6)
y_range=(-20e6, 20e6)
terrain_agg = generate_terrain(
template_terrain, x_range=x_range, y_range=y_range
)
# Edit Attributes
terrain_agg = terrain_agg.assign_attrs(
{
'Description': 'Example Terrain',
'units': 'km',
'Max Elevation': '4000',
}
)
terrain_agg = terrain_agg.rename({'x': 'lon', 'y': 'lat'})
terrain_agg = terrain_agg.rename('Elevation')
# Crop Image
values_agg = terrain_agg[0:300, 0:250]
cropped_agg = crop(
zones=terrain_agg,
values=values_agg,
zones_ids=[0],
)
# Edit Attributes
cropped_agg = cropped_agg.assign_attrs({'Description': 'Example Crop'})
# Plot Terrain
terrain_agg.plot(cmap = 'terrain', aspect = 2, size = 4)
plt.title("Terrain")
plt.ylabel("latitude")
plt.xlabel("longitude")
# Plot Cropped Terrain
cropped_agg.plot(cmap = 'terrain', aspect = 2, size = 4)
plt.title("Crop")
plt.ylabel("latitude")
plt.xlabel("longitude")
.. sourcecode:: python
>>> print(terrain_agg.shape)
(300, 500)
>>> print(terrain_agg.attrs)
{
'res': (80000.0, 133333.3333333333),
'Description': 'Example Terrain',
'units': 'km',
'Max Elevation': '4000',
}
>>> print(cropped_agg.shape)
(300, 250)
>>> print(cropped_agg.attrs)
{
'res': (80000.0, 133333.3333333333),
'Description': 'Example Crop',
'units': 'km',
'Max Elevation': '4000',
}
"""
top, bottom, left, right = _crop(zones.data, zones_ids)
arr = values[top: bottom + 1, left: right + 1]
arr.name = name
return arr