ml module¶
Module for machine learning with Google Earth Engine.
csv_to_classifier(in_csv)
¶
Convert a CSV file containing a list of strings (an ensemble of decision trees) to an ee.Classifier.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
in_csv |
str |
File path to the input CSV. |
required |
Returns:
Type | Description |
---|---|
object |
ee.Classifier. |
Source code in geemap/ml.py
def csv_to_classifier(in_csv):
"""Convert a CSV file containing a list of strings (an ensemble of decision trees) to an ee.Classifier.
Args:
in_csv (str): File path to the input CSV.
Returns:
object: ee.Classifier.
"""
in_csv = os.path.join(in_csv)
try:
with open(in_csv) as f:
lines = f.readlines()
except FileNotFoundError:
print(f"{in_csv} could not be found.")
return None
null_island = ee.Geometry.Point([0, 0])
features = [ee.Feature(null_island, {"tree": line.strip()}) for line in lines]
rf_fc = ee.FeatureCollection(features)
classifier = fc_to_classifier(rf_fc)
return classifier
export_trees_to_fc(trees, asset_id, description='geemap_rf_export')
¶
Function that creates a feature collection with a property tree which contains the string representation of decision trees and exports to ee asset for later use
Parameters:
Name | Type | Description | Default |
---|---|---|---|
trees |
list[str] |
list of string representation of the decision trees |
required |
asset_id |
str |
ee asset id path to export the feature collection to |
required |
Kwargs
description (str): optional description to provide export information. default = "geemap_rf_export"
Source code in geemap/ml.py
def export_trees_to_fc(trees, asset_id, description="geemap_rf_export"):
"""Function that creates a feature collection with a property tree which contains the string representation of decision trees and exports to ee asset for later use
args:
trees (list[str]): list of string representation of the decision trees
asset_id (str): ee asset id path to export the feature collection to
kwargs:
description (str): optional description to provide export information. default = "geemap_rf_export"
"""
# create a null geometry point. This is needed to properly export the feature collection
null_island = ee.Geometry.Point([0, 0])
# create a list of feature over null island
# set the tree property as the tree string
# encode return values (\n) as #, use to parse later
features = [
ee.Feature(null_island, {"tree": tree.replace("\n", "#")}) for tree in trees
]
# cast as feature collection
fc = ee.FeatureCollection(features)
# get export task and start
task = ee.batch.Export.table.toAsset(
collection=fc, description=description, assetId=asset_id
)
task.start()
fc_to_classifier(fc)
¶
Function that takes a feature collection resulting from export_trees_to_fc
and creates a ee.Classifier that can be used with ee objects
Parameters:
Name | Type | Description | Default |
---|---|---|---|
fc |
ee.FeatureCollection |
feature collection that has trees property for each feature that represents the decision tree |
required |
Returns:
Type | Description |
---|---|
classifier (ee.Classifier) |
ee classifier object representing an ensemble decision tree |
Source code in geemap/ml.py
def fc_to_classifier(fc):
"""Function that takes a feature collection resulting from `export_trees_to_fc` and creates a ee.Classifier that can be used with ee objects
args:
fc (ee.FeatureCollection): feature collection that has trees property for each feature that represents the decision tree
returns:
classifier (ee.Classifier): ee classifier object representing an ensemble decision tree
"""
# get a list of tree strings from feature collection
tree_strings = fc.aggregate_array("tree").map(
lambda x: ee.String(x).replace(
"#", "\n", "g"
) # expects that # is ecoded to be a return
)
# pass list of ee.Strings to an ensemble decision tree classifier (i.e. RandomForest)
classifier = ee.Classifier.decisionTreeEnsemble(tree_strings)
return classifier
rf_to_strings(estimator, feature_names, processes=2, output_mode='INFER')
¶
Function to convert a ensemble of decision trees into a list of strings. Wraps tree_to_string
Parameters:
Name | Type | Description | Default |
---|---|---|---|
estimator |
sklearn.ensemble.estimator |
A decision tree classifier or regressor object created using sklearn |
required |
feature_names |
list[str] |
List of strings that define the name of features (i.e. bands) used to create the model |
required |
processes |
int |
number of cpu processes to spawn. Increasing processes will improve speed for large models. default = 2 |
2 |
output_mode |
str |
the output mode of the estimator. Options are "INFER", "CLASSIFIATION", or "REGRESSION" (capitalization does not matter). default = "INFER" |
'INFER' |
Returns:
Type | Description |
---|---|
trees (list[str]) |
list of strings where each string represents a decision tree estimator and collectively represent an ensemble decision tree estimator (i.e. RandomForest) |
Source code in geemap/ml.py
def rf_to_strings(estimator, feature_names, processes=2, output_mode="INFER"):
"""Function to convert a ensemble of decision trees into a list of strings. Wraps `tree_to_string`
args:
estimator (sklearn.ensemble.estimator): A decision tree classifier or regressor object created using sklearn
feature_names (list[str]): List of strings that define the name of features (i.e. bands) used to create the model
processes (int): number of cpu processes to spawn. Increasing processes will improve speed for large models. default = 2
output_mode (str): the output mode of the estimator. Options are "INFER", "CLASSIFIATION", or "REGRESSION" (capitalization does not matter). default = "INFER"
returns:
trees (list[str]): list of strings where each string represents a decision tree estimator and collectively represent an ensemble decision tree estimator (i.e. RandomForest)
"""
# force output mode to be capital
output_mode = output_mode.upper()
available_modes = ["INFER", "CLASSIFICATION", "REGRESSION", "PROBABILITY"]
if output_mode not in available_modes:
raise ValueError(
f"The provided output_mode is not available, please provide one from the following list: {available_modes}"
)
# extract out the estimator trees
estimators = np.squeeze(estimator.estimators_)
if output_mode == "INFER":
if estimator.criterion in ["gini", "entropy"]:
class_labels = estimator.classes_
elif estimator.criterion in ["mse", "mae"]:
class_labels = None
else:
raise RuntimeError(
"Could not infer the output type from the estimator, please explicitly provide the output_mode "
)
elif output_mode == "CLASSIFICATION":
class_labels = estimator.classes_
else:
class_labels = None
# check that number of processors set to use is not more than available
if processes >= mp.cpu_count():
# if so, force to use only cpu count - 1
processes = mp.cpu_count() - 1
# run the tree extraction process in parallel
with mp.Pool(processes) as pool:
proc = pool.map_async(
partial(
tree_to_string,
feature_names=feature_names,
labels=class_labels,
output_mode=output_mode,
),
estimators,
)
trees = list(proc.get())
return trees
strings_to_classifier(trees)
¶
Function that takes string representation of decision trees and creates a ee.Classifier that can be used with ee objects
Parameters:
Name | Type | Description | Default |
---|---|---|---|
trees |
list[str] |
list of string representation of the decision trees |
required |
Returns:
Type | Description |
---|---|
classifier (ee.Classifier) |
ee classifier object representing an ensemble decision tree |
Source code in geemap/ml.py
def strings_to_classifier(trees):
"""Function that takes string representation of decision trees and creates a ee.Classifier that can be used with ee objects
args:
trees (list[str]): list of string representation of the decision trees
returns:
classifier (ee.Classifier): ee classifier object representing an ensemble decision tree
"""
# convert strings to ee.String objects
ee_strings = [ee.String(tree) for tree in trees]
# pass list of ee.Strings to an ensemble decision tree classifier (i.e. RandomForest)
classifier = ee.Classifier.decisionTreeEnsemble(ee_strings)
return classifier
tree_to_string(estimator, feature_names, labels=None, output_mode='INFER')
¶
Function to convert a sklearn decision tree object to a string format that EE can interpret
Parameters:
Name | Type | Description | Default |
---|---|---|---|
estimator |
sklearn.tree.estimator |
An estimator consisting of multiple decision tree classifiers. Expects object to contain estimators_ attribute |
required |
feature_names |
Iterable[str] |
List of strings that define the name of features (i.e. bands) used to create the model |
required |
labels |
Iterable[numeric] |
List of class labels to map outputs to, must be numeric values. If None, then raw outputs will be used. default = None |
None |
output_mode |
str |
the output mode of the estimator. Options are "INFER", "CLASSIFIATION", or "REGRESSION" (capitalization does not matter). default = "INFER" |
'INFER' |
Returns:
Type | Description |
---|---|
tree_str (str) |
string representation of decision tree estimator |
Exceptions:
Type | Description |
---|---|
RuntimeError |
raises run time error when function cannot determine if the estimator is for regression or classification problem |
Source code in geemap/ml.py
def tree_to_string(estimator, feature_names, labels=None, output_mode="INFER"):
"""Function to convert a sklearn decision tree object to a string format that EE can interpret
args:
estimator (sklearn.tree.estimator): An estimator consisting of multiple decision tree classifiers. Expects object to contain estimators_ attribute
feature_names (Iterable[str]): List of strings that define the name of features (i.e. bands) used to create the model
labels (Iterable[numeric]): List of class labels to map outputs to, must be numeric values. If None, then raw outputs will be used. default = None
output_mode (str): the output mode of the estimator. Options are "INFER", "CLASSIFIATION", or "REGRESSION" (capitalization does not matter). default = "INFER"
returns:
tree_str (str): string representation of decision tree estimator
raises:
RuntimeError: raises run time error when function cannot determine if the estimator is for regression or classification problem
"""
# extract out the information need to build the tree string
n_nodes = estimator.tree_.node_count
children_left = estimator.tree_.children_left
children_right = estimator.tree_.children_right
feature_idx = estimator.tree_.feature
impurities = estimator.tree_.impurity
n_samples = estimator.tree_.n_node_samples
thresholds = estimator.tree_.threshold
features = [feature_names[i] for i in feature_idx]
raw_vals = np.squeeze(estimator.tree_.value)
# first check if user wants to infer output mode
# if so, reset the output_mode variable to a valid mode
if output_mode == "INFER":
if raw_vals.ndim == 2:
output_mode = "CLASSIFICATION"
elif raw_vals.ndim == 1:
output_mode = "REGRESSION"
else:
raise RuntimeError(
"Could not infer the output type from the estimator, please explicitly provide the output_mode "
)
# second check on the output mode after the inference
if output_mode == "CLASSIFICATION":
# take argmax along class axis from values
values = raw_vals.argmax(axis=-1)
if labels is not None:
index_labels = np.unique(values)
lookup = {idx: labels[i] for i, idx in enumerate(index_labels)}
values = [lookup[v] for v in values]
out_type = int
elif output_mode == "REGRESSION":
# take values and drop un needed axis
values = np.around(raw_vals, decimals=6)
out_type = float
elif output_mode == "PROBABILITY":
# calculate fraction of samples of the same class in a leaf
# currently only supporting binary classifications
# check if n classes == 2 (i.e. binary classes)
if raw_vals.shape[-1] != 2:
raise ValueError(
"shape mismatch: outputs from trees = {raw_vals.shape[-1]} classes, currently probability outputs is support for binary classifications"
)
probas = np.around(
(raw_vals / np.sum(raw_vals, axis=1)[:, np.newaxis]), decimals=6
)
values = probas[:, -1]
out_type = float
elif output_mode == "MULTIPROBABILITY":
# calculate fraction of samples of the same class in a leaf
# this is a 2-d array making the output multidimensional
raise NotImplementedError(
"Currently multiprobability output is not support, please choose one of the following output modes: ['CLASSIFIATION', 'REGRESSION', 'PROBABILITY' or 'INFER']"
)
# probas = np.around(
# (raw_vals / np.sum(raw_vals,axis=1)[:,np.newaxis]),
# decimals=6
# )
# values = probas.tolist()
# out_type = list
else:
raise RuntimeError(
"could not understand estimator type and parse out the values"
)
# use iterative pre-order search to extract node depth and leaf information
node_ids = np.zeros(shape=n_nodes, dtype=np.int64)
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)] # seed is the root node id and its parent depth
while len(stack) > 0:
node_id, parent_depth = stack.pop()
node_depth[node_id] = parent_depth + 1
node_ids[node_id] = node_id
# If we have a test node
if children_left[node_id] != children_right[node_id]:
stack.append((children_left[node_id], parent_depth + 1))
stack.append((children_right[node_id], parent_depth + 1))
else:
is_leaves[node_id] = True
# create a table of the initial structure
# each row is a node or leaf
df = pd.DataFrame(
{
"node_id": node_ids,
"node_depth": node_depth,
"is_leaf": is_leaves,
"children_left": children_left,
"children_right": children_right,
"value": values,
"criterion": impurities,
"n_samples": n_samples,
"threshold": thresholds,
"feature_name": features,
"sign": ["<="] * n_nodes,
},
dtype="object",
)
# the table representation does not have lef vs right node structure
# so we need to add in right nodes in the correct location
# we do this by first calculating which nodes are right and then insert them at the correct index
# get a dict of right node rows and assign key based on index where to insert
inserts = {}
for row in df.itertuples():
child_r = row.children_right
if child_r > row.Index:
ordered_row = np.array(row)
ordered_row[-1] = ">"
inserts[child_r] = ordered_row[1:] # drop index value
# sort the inserts as to keep track of the additive indexing
inserts_sorted = {k: inserts[k] for k in sorted(inserts.keys())}
# loop through the row inserts and add to table (array)
table_values = df.values
for i, k in enumerate(inserts_sorted.keys()):
table_values = np.insert(table_values, (k + i), inserts_sorted[k], axis=0)
# make the ordered table array into a dataframe
# note: df is dtype "object", need to cast later on
ordered_df = pd.DataFrame(table_values, columns=df.columns)
max_depth = np.max(ordered_df.node_depth.astype(int))
tree_str = f"1) root {n_samples[0]} 9999 9999 ({impurities.sum()})\n"
previous_depth = -1
cnts = []
# loop through the nodes and calculate the node number and values per node
for row in ordered_df.itertuples():
node_depth = int(row.node_depth)
left = int(row.children_left)
right = int(row.children_right)
if left != right:
if row.Index == 0:
cnt = 2
elif previous_depth > node_depth:
depths = ordered_df.node_depth.values[: row.Index]
idx = np.where(depths == node_depth)[0][-1]
# cnt = (cnts[row.Index-1] // 2) + 1
cnt = cnts[idx] + 1
elif previous_depth < node_depth:
cnt = cnts[row.Index - 1] * 2
elif previous_depth == node_depth:
cnt = cnts[row.Index - 1] + 1
if node_depth == (max_depth - 1):
value = out_type(ordered_df.iloc[row.Index + 1].value)
samps = int(ordered_df.iloc[row.Index + 1].n_samples)
criterion = float(ordered_df.iloc[row.Index + 1].criterion)
tail = " *\n"
else:
if (
(bool(ordered_df.loc[ordered_df.node_id == left].iloc[0].is_leaf))
and (
bool(
int(row.Index)
< int(ordered_df.loc[ordered_df.node_id == left].index[0])
)
)
and (str(row.sign) == "<=")
):
rowx = ordered_df.loc[ordered_df.node_id == left].iloc[0]
tail = " *\n"
value = out_type(rowx.value)
samps = int(rowx.n_samples)
criterion = float(rowx.criterion)
elif (
(bool(ordered_df.loc[ordered_df.node_id == right].iloc[0].is_leaf))
and (
bool(
int(row.Index)
< int(ordered_df.loc[ordered_df.node_id == right].index[0])
)
)
and (str(row.sign) == ">")
):
rowx = ordered_df.loc[ordered_df.node_id == right].iloc[0]
tail = " *\n"
value = out_type(rowx.value)
samps = int(rowx.n_samples)
criterion = float(rowx.criterion)
else:
value = out_type(row.value)
samps = int(row.n_samples)
criterion = float(row.criterion)
tail = "\n"
# extract out the information needed in each line
spacing = (node_depth + 1) * " " # for pretty printing
fname = str(row.feature_name) # name of the feature (i.e. band name)
tresh = float(row.threshold) # threshold
sign = str(row.sign)
tree_str += f"{spacing}{cnt}) {fname} {sign} {tresh:.6f} {samps} {criterion:.4f} {value}{tail}"
previous_depth = node_depth
cnts.append(cnt)
return tree_str
trees_to_csv(trees, out_csv)
¶
Save a list of strings (an ensemble of decision trees) to a CSV file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
trees |
list |
A list of strings (an ensemble of decision trees). |
required |
out_csv |
str |
File path to the output csv |
required |
Source code in geemap/ml.py
def trees_to_csv(trees, out_csv):
"""Save a list of strings (an ensemble of decision trees) to a CSV file.
Args:
trees (list): A list of strings (an ensemble of decision trees).
out_csv (str): File path to the output csv
"""
out_csv = os.path.abspath(out_csv)
with open(out_csv, "w") as f:
f.writelines([tree.replace("\n", "#") + "\n" for tree in trees])