# Importing libraries
import numpy as np
import pandas as pd
from numpy.random import dirichlet
from fedartml.function_base import jensen_shannon_distance, hellinger_distance, earth_movers_distance
[docs]
class SplitAsFederatedData:
"""
Creates federated data from the provided centralized data (features and labels) to exemplify identically and
non-identically distributed labels and features across the local nodes (clients). It allows one to select between
two methods of data federation (percent_noniid and dirichlet). It works only for classification problems
(labels as classes).
Parameters
----------
random_state : int
Controls the shuffling applied to the generation of pseudorandom numbers. Pass an int for reproducible
output across multiple function calls.
References
----------
.. [1] (dirichlet) Tao Lin∗, Lingjing Kong∗, Sebastian U. Stich, Martin Jaggi. (2020). Ensemble Distillation for Robust Model Fusion in Federated Learning
https://proceedings.neurips.cc/paper/2020/file/18df51b97ccd68128e994804f3eccc87-Supplemental.pdf
.. [2] (percent_noniid) Hsieh, K., Phanishayee, A., Mutlu, O., & Gibbons, P. (2020, November). The non-iid data quagmire of decentralized machine learning. In International Conference on Machine Learning (pp. 4387-4398). PMLR.
https://proceedings.mlr.press/v119/hsieh20a/hsieh20a.pdf
"""
def __init__(self, random_state=None):
    """
    Store the seed configuration of the splitter.

    Parameters
    ----------
    random_state : int
        Value kept on the instance as ``self.random_state``. ``None`` (default) means no seed was provided.
    """
    self.random_state = random_state
[docs]
@staticmethod
def percent_noniid_method(labels, local_nodes, pct_noniid=0, random_state=None):
    """
    Create a federated dataset divided per each local node (client) using the Percentage of Non-IID
    (pct_noniid) method.

    The first ``pct_noniid`` percent of ``labels`` (in the order given, no sorting is applied) form the non-IID
    part: the classes found there are split into consecutive groups of ``n_classes // local_nodes`` classes, one
    group per node, and the last node additionally takes every remaining class. The rest of the examples form
    the IID part, where each example is assigned to a uniformly random node.

    Parameters
    ----------
    labels : array-like
        The target values (class labels in classification).
    local_nodes : int
        Number of local nodes (clients) used in the federated learning paradigm.
    pct_noniid : float
        Percentage (between 0 and 100) desired of non-IID-ness for the federated data.
    random_state : int
        Seed passed to ``numpy.random.seed`` before drawing the IID assignment. Pass an int for reproducible
        output across multiple function calls.

    Returns
    -------
    pctg_distr : list of list
        Percentage (between 0 and 1) distribution of the classes for each local node (client). A node that
        received no example gets NaN for every class.
    num_distr : list of list
        Labels of the examples assigned to each local node (client).
    idx_distr : list of list
        Indexes of examples (partition) taken for each local node (client).
    num_per_node : list of list
        Number of examples of each class (sorted class order) held by each local node (client).

    References
    ----------
    .. [1] (percent_noniid) Hsieh, K., Phanishayee, A., Mutlu, O., & Gibbons, P. (2020, November). The non-iid
       data quagmire of decentralized machine learning. In International Conference on Machine Learning
       (pp. 4387-4398). PMLR. https://proceedings.mlr.press/v119/hsieh20a/hsieh20a.pdf
    """
    all_labels = list(labels)
    # Number of leading examples that make up the non-IID part
    n_noniid = int(len(all_labels) * (pct_noniid / 100))
    offset = len(all_labels[:n_noniid])
    n_iid = len(all_labels) - offset

    # Encode every label as the position of its class in the sorted class list. Working on integer ids keeps
    # the counting independent of the label dtype and of the pandas version.
    classes, class_ids = np.unique(all_labels, return_inverse=True)
    class_ids = class_ids.reshape(-1)
    noniid_ids = class_ids[:offset]
    noniid_classes = np.unique(noniid_ids)
    classes_per_node = len(noniid_classes) // local_nodes

    # Randomly assign each IID example to a local node (random numbers from 0 to local_nodes - 1)
    np.random.seed(random_state)
    iid_assignment = np.random.randint(0, local_nodes, size=n_iid)

    pctg_distr = []
    num_distr = []
    idx_distr = []
    num_per_node = []
    for node in range(local_nodes):
        start = node * classes_per_node
        # The last node takes all the classes that were not handed out to the previous nodes
        stop = len(noniid_classes) if node == local_nodes - 1 else start + classes_per_node
        node_classes = noniid_classes[start:stop]

        noniid_idx = np.flatnonzero(np.isin(noniid_ids, node_classes))
        iid_idx = np.flatnonzero(iid_assignment == node) + offset
        node_idx = noniid_idx.tolist() + iid_idx.tolist()
        node_labels = [all_labels[pos] for pos in node_idx]

        # Get distribution of labels
        counts = np.bincount(class_ids[np.asarray(node_idx, dtype=np.intp)], minlength=len(classes))
        total = counts.sum()
        if total:
            percentages = (counts / total).tolist()
        else:
            # Empty node: keep the NaN percentages that a 0 / 0 division yields
            percentages = [float("nan")] * len(classes)

        num_per_node.append(counts.tolist())
        pctg_distr.append(percentages)
        num_distr.append(node_labels)
        idx_distr.append(node_idx)
    return pctg_distr, num_distr, idx_distr, num_per_node
[docs]
@staticmethod
def dirichlet_method(labels, local_nodes, alpha=1000, random_state=None):
    """
    Create a federated dataset divided per each local node (client) using the Dirichlet (dirichlet) method.

    For every class, its examples are shuffled and split among the nodes following proportions drawn from a
    symmetric Dirichlet distribution. Nodes that already hold at least ``len(labels) / local_nodes`` examples
    get a zero proportion (balance step). The whole split is redrawn until every node holds at least 10
    examples.

    Parameters
    ----------
    labels : array-like
        The target values (class labels in classification). Any label values are accepted (they do not need
        to be the integers 0..K-1).
    local_nodes : int
        Number of local nodes (clients) used in the federated learning paradigm.
    alpha : float
        Concentration parameter of the Dirichlet distribution defining the desired degree of non-IID-ness for
        the federated data.
    random_state : int
        Base seed passed to ``numpy.random.seed``; it is increased by 100 after each class (and after each
        node in the final shuffle). Pass an int for reproducible output across multiple function calls.

    Returns
    -------
    pctg_distr : list of list
        Percentage (between 0 and 1) distribution of the classes for each local node (client).
    num_distr : list of numpy.ndarray
        Labels of the examples assigned to each local node (client).
    idx_distr : list of list
        Indexes of examples (partition) taken for each local node (client).
    num_per_node : list of list
        Number of examples of each class (sorted class order) held by each local node (client).

    Raises
    ------
    ValueError
        If there are fewer than ``10 * local_nodes`` examples, in which case the minimum node size can never
        be reached.

    References
    ----------
    .. [1] (dirichlet) Tao Lin∗, Lingjing Kong∗, Sebastian U. Stich, Martin Jaggi. (2020). Ensemble
       Distillation for Robust Model Fusion in Federated Learning
       https://proceedings.neurips.cc/paper/2020/file/18df51b97ccd68128e994804f3eccc87-Supplemental.pdf
    """
    # https://github.com/Xtra-Computing/NIID-Bench/blob/main/partition.py
    # https://github.com/IBM/probabilistic-federated-neural-matching/blob/master/experiment.py
    min_node_size = 10
    labels = np.array(labels)
    # class_ids[i] is the position of labels[i] in the sorted class list; for labels that are already
    # 0..K-1 it is identical to the labels themselves.
    classes, class_ids = np.unique(labels, return_inverse=True)
    class_ids = class_ids.reshape(-1)
    n_examples = labels.shape[0]
    if n_examples < min_node_size * local_nodes:
        # Without this guard the resampling loop below would never terminate
        raise ValueError(
            "At least {} examples are required to give each of the {} local nodes {} examples, got {}.".format(
                min_node_size * local_nodes, local_nodes, min_node_size, n_examples))

    seed = random_state
    smallest = 0
    while smallest < min_node_size:
        idx_batch = [[] for _ in range(local_nodes)]
        for class_id in range(len(classes)):
            idx_k = np.where(class_ids == class_id)[0]
            np.random.seed(seed)
            np.random.shuffle(idx_k)
            proportions = np.random.dirichlet(np.repeat(alpha, local_nodes))
            # Balance: nodes that already reached their fair share receive nothing from this class
            proportions = np.array(
                [p * (len(idx_j) < n_examples / local_nodes) for p, idx_j in zip(proportions, idx_batch)])
            proportions = proportions / proportions.sum()
            cuts = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1]
            idx_batch = [idx_j + part.tolist() for idx_j, part in zip(idx_batch, np.split(idx_k, cuts))]
            smallest = min(len(idx_j) for idx_j in idx_batch)
            if random_state is not None:
                # Use a different (but reproducible) seed for the next class / attempt
                seed += 100

    pctg_distr = []
    num_distr = []
    idx_distr = []
    num_per_node = []
    seed = random_state
    for node_idx in idx_batch:
        np.random.seed(seed)
        np.random.shuffle(node_idx)
        # Get distribution of labels
        counts = np.bincount(class_ids[node_idx], minlength=len(classes))
        num_per_node.append(counts.tolist())
        pctg_distr.append((counts / counts.sum()).tolist())
        # Get examples for each batch from labels
        num_distr.append(labels[node_idx])
        idx_distr.append(node_idx)
        if random_state is not None:
            seed += 100
    return pctg_distr, num_distr, idx_distr, num_per_node
[docs]
@staticmethod
def add_gaussian_noise(feat, mu=0, sigma=0, client_id=0, local_nodes=4, random_state=None):
    """
    Add Gaussian random noise to given features.

    The standard deviation actually used is ``sigma * client_id / local_nodes``, i.e. it grows linearly with
    the client identifier.

    Parameters
    ----------
    feat : array-like
        Features (e.g. images) to perturb. Must expose a ``shape`` attribute.
    mu : float
        Mean ("centre") of the Gaussian distribution.
    sigma : float
        Base standard deviation (noise) of the Gaussian distribution. Must be non-negative.
    client_id : int
        Identification or number of the client to add the noise.
    local_nodes : int
        Number of local nodes (clients) used in the federated learning paradigm.
    random_state : int
        Seed passed to ``numpy.random.seed`` before sampling. Pass an int for reproducible output across
        multiple function calls.

    Returns
    -------
    feat : array-like
        The given features with the random noise added.

    References
    ----------
    .. [1] (gaussian noise) Li, Q., Diao, Y., Chen, Q., & He, B. (2022, May). Federated learning on non-iid
       data silos: An experimental study. In 2022 IEEE 38th International Conference on Data Engineering
       (ICDE) (pp. 965-978). IEEE.
    """
    # Scale the requested sigma by the relative position of the client
    client_scale = sigma * client_id / local_nodes
    np.random.seed(random_state)
    return feat + np.random.normal(loc=mu, scale=client_scale, size=feat.shape)
@staticmethod
def calculate_bins_range(column, sigma_noise, n_bins):
    """
    Build evenly spaced values covering a column's range widened by four noise deviations on each side.

    Parameters
    ----------
    column : array-like
        Values of one feature. Must expose ``min()`` and ``max()``.
    sigma_noise : float
        Standard deviation of the noise used to widen the range.
    n_bins : int
        Number of evenly spaced values to generate (both ends included).

    Returns
    -------
    bins_range : numpy.ndarray
        ``n_bins`` values from ``min - 4 * sigma_noise`` to ``max + 4 * sigma_noise``.
    """
    # At 4 deviations from the mean the data will keep almost at 100%
    margin = 4 * sigma_noise
    lower = column.min() - margin
    upper = column.max() + margin
    return np.linspace(lower, upper, n_bins)
[docs]
@staticmethod
def create_histogram(flat_input, bins):
    """
    Create a normalized histogram from the given flattened features.

    Parameters
    ----------
    flat_input : array-like (flatten)
        Flattened feature values. Lists are accepted as well as numpy arrays.
    bins : int or array-like
        Number of equal-width bins, or the sequence of bin edges; forwarded unchanged to ``numpy.histogram``.

    Returns
    -------
    histogram : numpy.ndarray
        Bin counts divided by the length of the first axis of ``flat_input``. For one-dimensional input they
        sum to 1 when every value falls inside the bins. The bin edges are not returned.
    """
    values = np.asarray(flat_input)
    counts, _ = np.histogram(values, bins=bins)
    return counts / values.shape[0]
[docs]
@staticmethod
def dirichlet_method_quant_skew(labels, local_nodes, alpha=1000, random_state=None, method="no-quant-skew"):
    """
    Create a federated dataset divided per each local node (client) using the Dirichlet (dirichlet) method to
    evaluate quantity skew.

    The examples are randomly permuted and cut into ``local_nodes`` consecutive chunks whose sizes depend on
    ``method``.

    Parameters
    ----------
    labels : array-like
        The target values (class labels in classification).
    local_nodes : int
        Number of local nodes (clients) used in the federated learning paradigm.
    alpha : float
        Concentration parameter of the Dirichlet distribution defining the desired degree of non-IID-ness for
        the federated data.
    random_state : int
        Seed passed to ``numpy.random.seed``. Pass an int for reproducible output across multiple function
        calls.
    method : str
        Method to create the federated data based on quantity skew. Possible options:

        - "no-quant-skew" (default): chunks of (almost) equal size.
        - "dirichlet": sizes follow Dirichlet proportions, redrawn until the smallest expected chunk holds at
          least three examples per class.
        - "minsize-dirichlet": sizes follow a single Dirichlet draw whose proportions are raised to a minimum
          share and renormalized.

    Returns
    -------
    pctg_distr : list of list
        Percentage (between 0 and 1) distribution of the classes for each local node (client). A node that
        received no example gets NaN for every class.
    num_distr : list of numpy.ndarray
        Labels of the examples assigned to each local node (client).
    idx_distr : list of list
        Indexes of examples (partition) taken for each local node (client).
    num_per_node : list of list
        Number of examples of each class (sorted class order) held by each local node (client).

    Raises
    ------
    ValueError
        If ``method`` is unknown, or if ``method="dirichlet"`` and there are too few examples for every node
        to reach the minimum required size.

    References
    ----------
    .. [1] (dirichlet) Tao Lin∗, Lingjing Kong∗, Sebastian U. Stich, Martin Jaggi. (2020). Ensemble
       Distillation for Robust Model Fusion in Federated Learning
       https://proceedings.neurips.cc/paper/2020/file/18df51b97ccd68128e994804f3eccc87-Supplemental.pdf
    """
    # https://github.com/Xtra-Computing/NIID-Bench/blob/main/partition.py
    # https://github.com/IBM/probabilistic-federated-neural-matching/blob/master/experiment.py
    labels = np.array(labels)
    # class_ids[i] is the position of labels[i] in the sorted class list
    classes, class_ids = np.unique(labels, return_inverse=True)
    class_ids = class_ids.reshape(-1)
    n_examples = labels.shape[0]

    np.random.seed(random_state)
    idxs = np.random.permutation(n_examples)
    min_require_size = len(classes) * 3

    if method == "no-quant-skew":
        # No quantity skew: every node gets the same number of examples (up to a remainder of one)
        chunks = np.array_split(idxs, local_nodes)
    elif method == "dirichlet":
        if n_examples < min_require_size * local_nodes:
            # Without this guard the resampling loop below would never terminate
            raise ValueError(
                "At least {} examples are required to give each of the {} local nodes {} examples, got {}."
                .format(min_require_size * local_nodes, local_nodes, min_require_size, n_examples))
        min_size = 0
        while min_size < min_require_size:
            proportions = np.random.dirichlet(np.repeat(alpha, local_nodes))
            proportions = proportions / proportions.sum()
            min_size = np.min(proportions * len(idxs))
        cuts = (np.cumsum(proportions) * len(idxs)).astype(int)[:-1]
        chunks = np.split(idxs, cuts)
    elif method == "minsize-dirichlet":
        proportions = np.random.dirichlet(np.repeat(alpha, local_nodes))
        proportions = proportions / proportions.sum()
        # Raise too-small shares to the minimum share, then renormalize
        min_share = (min_require_size + 1) / len(idxs)
        proportions = [min_share if p < min_share else p for p in proportions]
        total_share = sum(proportions)
        proportions = [p / total_share for p in proportions]
        cuts = (np.cumsum(proportions) * len(idxs)).astype(int)[:-1]
        chunks = np.split(idxs, cuts)
    else:
        raise ValueError("Method '" + str(method) +
                         "' not implemented. Available quantity skew methods are: ['dirichlet', "
                         "'minsize-dirichlet', 'no-quant-skew']")

    idx_batch = [chunk.tolist() for chunk in chunks]
    pctg_distr = []
    num_distr = []
    idx_distr = []
    num_per_node = []
    seed = random_state
    for node_idx in idx_batch:
        np.random.seed(seed)
        np.random.shuffle(node_idx)
        positions = np.asarray(node_idx, dtype=np.intp)
        # Get distribution of labels
        counts = np.bincount(class_ids[positions], minlength=len(classes))
        total = counts.sum()
        if total:
            percentages = (counts / total).tolist()
        else:
            # Empty node: keep the NaN percentages that a 0 / 0 division yields
            percentages = [float("nan")] * len(classes)
        num_per_node.append(counts.tolist())
        pctg_distr.append(percentages)
        # Get examples for each batch from labels
        num_distr.append(labels[positions])
        idx_distr.append(node_idx)
        if random_state is not None:
            seed += 100
    return pctg_distr, num_distr, idx_distr, num_per_node
[docs]
@staticmethod
def st_dirichlet_method(labels, local_nodes, alpha=1000, random_state=None, st_variable=None):
    """
    Create a federated dataset divided per each local node (client) applying the Dirichlet (dirichlet) method
    to the categories of a spatio-temporal variable.

    For every category of ``st_variable``, its examples are shuffled and split among the nodes following
    proportions drawn from a symmetric Dirichlet distribution. Nodes that already hold at least
    ``len(st_variable) / local_nodes`` examples get a zero proportion (balance step). The whole split is
    redrawn until every node holds at least 10 examples.

    Parameters
    ----------
    labels : array-like
        The target values (class labels in classification).
    local_nodes : int
        Number of local nodes (clients) used in the federated learning paradigm.
    alpha : float
        Concentration parameter of the Dirichlet distribution defining the desired degree of non-IID-ness for
        the federated data.
    random_state : int
        Base seed passed to ``numpy.random.seed``; it is increased by 100 after each category (and after each
        node in the final shuffle). Pass an int for reproducible output across multiple function calls.
    st_variable : array-like
        The spatio-temporal variable from the centralized data, one category per example. Any category
        values are accepted (they do not need to be the integers 0..K-1).

    Returns
    -------
    pctg_distr : list of list
        Percentage (between 0 and 1) distribution of the classes for each local node (client).
    num_distr : list of numpy.ndarray
        Labels of the examples assigned to each local node (client).
    idx_distr : list of list
        Indexes of examples (partition) taken for each local node (client).
    num_per_node : list of list
        Number of examples of each class (sorted class order) held by each local node (client).
    pctg_distr_st_var : list of list
        Percentage (between 0 and 1) distribution of the spatio-temporal variable's categories for each local
        node (client).

    Raises
    ------
    ValueError
        If ``st_variable`` is missing, if its length differs from the length of ``labels``, or if there are
        fewer than ``10 * local_nodes`` examples (the minimum node size could never be reached).
    """
    if st_variable is None:
        raise ValueError("'st_variable' must be provided to use the st-dirichlet method.")
    min_node_size = 10
    st_variable = np.array(st_variable)
    labels = np.array(labels)
    # *_ids[i] is the position of the i-th value in the sorted list of distinct values; for values that are
    # already 0..K-1 it is identical to the values themselves.
    categories, categ_ids = np.unique(st_variable, return_inverse=True)
    categ_ids = categ_ids.reshape(-1)
    classes, class_ids = np.unique(labels, return_inverse=True)
    class_ids = class_ids.reshape(-1)
    n_examples = categ_ids.shape[0]
    if class_ids.shape[0] != n_examples:
        raise ValueError(
            "'labels' and 'st_variable' must have the same length, got {} and {}.".format(
                class_ids.shape[0], n_examples))
    if n_examples < min_node_size * local_nodes:
        # Without this guard the resampling loop below would never terminate
        raise ValueError(
            "At least {} examples are required to give each of the {} local nodes {} examples, got {}.".format(
                min_node_size * local_nodes, local_nodes, min_node_size, n_examples))

    seed = random_state
    smallest = 0
    while smallest < min_node_size:
        idx_batch = [[] for _ in range(local_nodes)]
        for categ_id in range(len(categories)):
            idx_k = np.where(categ_ids == categ_id)[0]
            np.random.seed(seed)
            np.random.shuffle(idx_k)
            proportions = np.random.dirichlet(np.repeat(alpha, local_nodes))
            # Balance: nodes that already reached their fair share receive nothing from this category
            proportions = np.array(
                [p * (len(idx_j) < n_examples / local_nodes) for p, idx_j in zip(proportions, idx_batch)])
            proportions = proportions / proportions.sum()
            cuts = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1]
            idx_batch = [idx_j + part.tolist() for idx_j, part in zip(idx_batch, np.split(idx_k, cuts))]
            smallest = min(len(idx_j) for idx_j in idx_batch)
            if random_state is not None:
                # Use a different (but reproducible) seed for the next category / attempt
                seed += 100

    pctg_distr = []
    num_distr = []
    idx_distr = []
    num_per_node = []
    pctg_distr_st_var = []
    seed = random_state
    for node_idx in idx_batch:
        np.random.seed(seed)
        np.random.shuffle(node_idx)
        # Get distribution of labels
        label_counts = np.bincount(class_ids[node_idx], minlength=len(classes))
        num_per_node.append(label_counts.tolist())
        pctg_distr.append((label_counts / label_counts.sum()).tolist())
        num_distr.append(labels[node_idx])
        idx_distr.append(node_idx)
        # Get spatio-temporal variable distribution per node
        categ_counts = np.bincount(categ_ids[node_idx], minlength=len(categories))
        pctg_distr_st_var.append((categ_counts / categ_counts.sum()).tolist())
        if random_state is not None:
            seed += 100
    return pctg_distr, num_distr, idx_distr, num_per_node, pctg_distr_st_var
[docs]
def create_clients(self, image_list, label_list, num_clients=4, prefix_cli='client', method="dirichlet",
alpha=1000, percent_noniid=0, sigma_noise=0, bins='n_samples', feat_sample_rate=0.1,
feat_skew_method="gaussian-noise", alpha_feat_split=1000, idx_feat='feat-mean',
feat_quantile=20, quant_skew_method="no-quant-skew", alpha_quant_split=1000,
spa_temp_skew_method="no-spatemp-skew", alpha_spa_temp=1000, spa_temp_var=None):
"""
Create a federated dataset divided per each local node (client) using the desired method (percent_noniid or dirichlet). It works only for classification problems (labels as classes) with quantitaive (numeric) features.
Parameters
----------
image_list : array-like
List of numpy arrays (or pandas dataframe) with images (i.e. features) from the centralized data.
label_list : array-like
The target values (class labels in classification) from the centralized data.
num_clients : int
Number of local nodes (clients) used in the federated learning paradigm.
prefix_cli : str
The clients' name prefix, e.g., client_1, client_2, etc.
method : string
Method to create the federated data based on label skew. Possible options: "percent_noniid"(default), "dirichlet", "no-label-skew"
alpha : float
Concentration parameter of the Dirichlet distribution defining the desired degree of non-IID-ness for the labels of the federated data.
percent_noniid : float
Percentage (between o and 100) desired of non-IID-ness for the labels of the federated data.
sigma_noise : float
Noise (sigma parameter of Gaussian distro) to be added to the features. Applicable only for feat_skew_method="gaussian-noise".
bins : int or str
Number of bins used to create histogram of features to check feature skew. It can be the word 'n_samples' or the integer number of bins to use. If 'n_samples'(default) is selected, then it is set as the number values of the image_list (examples). Applicable only for feat_skew_method="gaussian-noise".
feat_sample_rate : float
Proportion (between 0 and 1) to be sampled from features. This parameter is useful when dealing with datasets with many features (i.e. images). Applicable only for feat_skew_method="gaussian-noise".
feat_skew_method : str
Method to create the federated data based on feature skew. Possible options: "gaussian-noise"(default), "hist-dirichlet"
alpha_feat_split : float
Concentration parameter of the Dirichlet distribution defining the desired degree of non-IID-ness for the features of the federated data. Applicable only for feat_skew_method="hist-dirichlet".
idx_feat : int or str
Position (idx) of feature used to simulate feature skew. It can be the word 'feat-mean' or the integer number of the position to use. If 'feat-mean'(default) is selected, then the mean of all the features is computed as representative of the features. Applicable only for feat_skew_method="hist-dirichlet".
feat_quantile : int
Number quantiles to use in the feature skew simulation. 20 for ventiles (default), 10 for deciles, 4 for quartiles, etc. Applicable only for feat_skew_method="hist-dirichlet".
quant_skew_method : str
Method to create the federated data based on quantity skew. Possible options: "no-quant-skew"(default), "dirichlet", "minsize-dirichlet"
alpha_quant_split : float
Concentration parameter of the Dirichlet distribution defining the desired degree of non-IID-ness for the quantity skew of the federated data. Applicable only for quant_skew_method="dirichlet".
spa_temp_skew_method : str
Method to create the federated data based on spatio-temporal skew. Possible options: "no-spatemp-skew"(default), "st-dirichlet"
alpha_spa_temp : float
Concentration parameter of the Dirichlet distribution defining the desired degree of non-IID-ness for the spatio-temporal skew of the federated data. Applicable only for spa_temp_skew_method="st-dirichlet".
spa_temp_var : array-like
The spatio-temporal variable from the centralized data. Applicable only for spa_temp_skew_method="st-dirichlet".
Returns
-------
fed_data : dict
Contains features (images) and labels for each local node (client) after federating the data. Includes "with_class_completion" and "without_class_completion" cases.
ids_list_fed_data : array-like
Indexes of examples (partition) taken for each local node (client).
num_missing_classes : array-like
Number of missing classes per each local node when creating the federated dataset
distances : dict
Distances calculated while measuring heterogeneity (non-IID-ness) of the label's distribution among clients. Includes "with_class_completion" and "without_class_completion" cases.
spatemp_fed_data : dict
Contains categories of the spatio-temporal variable for each local node (client) after federating the data. It is generated only when spa_temp_skew_method = "st-dirichlet".
Note: When creating federated data and setting heterogeneous distributions (i.e. high values of percent_noniid or small values of alpha), it is more likely the clients hold examples from only one class. Then, two cases (for labels and features) are returned as output for fed_data and distances:
- "with_class_completion": In this case, the clients are completed with one (random) example of each missing class for each client to have all the label's classes.
- "without_class_completion": In this case, the clients are NOT completed with one (random) example of each missing class. Consequently, summing the number of examples of each client results in the same number of total examples (number of rows in image_list).
References
----------
.. [1] (dirichlet) Tao Lin∗, Lingjing Kong∗, Sebastian U. Stich, Martin Jaggi. (2020). Ensemble Distillation for Robust Model Fusion in Federated Learning0
https://proceedings.neurips.cc/paper/2020/file/18df51b97ccd68128e994804f3eccc87-Supplemental.pdf
.. [2] (percent_noniid) Hsieh, K., Phanishayee, A., Mutlu, O., & Gibbons, P. (2020, November). The non-iid data quagmire of decentralized machine learning. In International Conference on Machine Learning (pp. 4387-4398).PMLR.
https://proceedings.mlr.press/v119/hsieh20a/hsieh20a.pdf
.. [3] (gaussian noise) Li, Q., Diao, Y., Chen, Q., & He, B. (2022, May). Federated learning on non-iid data silos: An experimental study. In 2022 IEEE 38th International Conference on Data Engineering (ICDE) (pp. 965-978). IEEE.
Examples
--------
>>> from fedartml import SplitAsFederatedData
>>> from keras.datasets import mnist
>>> (train_X, train_y), (test_X, test_y) = mnist.load_data()
>>> my_federater = SplitAsFederatedData(random_state=0)
>>>
>>> # Using percent_noniid method
>>> clients_glob, list_ids_sampled, miss_class_per_node, distances =
>>> my_federater.create_clients(image_list=train_X, label_list=train_y, num_clients=4,
>>> prefix_cli='Local_node',method="percent_noniid", percent_noniid=0)
>>>
>>> # Using dirichlet method
>>> clients_glob, list_ids_sampled, miss_class_per_node, distances =
>>> my_federater.create_clients(image_list=train_X, label_list=train_y, num_clients=4,
>>> prefix_cli='Local_node',method="dirichlet", alpha=1000)
"""
# create a list of client names
client_names = ['{}_{}'.format(prefix_cli, i + 1) for i in range(num_clients)]
# Zip the data as list
data = list(zip(image_list, label_list))
if (method == "percent_noniid" or method == "dirichlet") and feat_skew_method == "hist-dirichlet":
raise ValueError(
"The hist-dirichlet method can't be used simultaneously with dirichlet nor percent_noniid label skew methods. If you intent to use hist-dirichlet use method == 'no-label-skew'")
elif (quant_skew_method == "dirichlet") and feat_skew_method == "hist-dirichlet":
raise ValueError(
"The hist-dirichlet method can't be used simultaneously with dirichlet quantity skew methods. If you intent to use hist-dirichlet use quant_skew_method == 'no-quant-skew'")
if (method == "percent_noniid" or method == "dirichlet") and quant_skew_method == "dirichlet":
raise ValueError(
"The dirichlet (for quantity skew) method can't be used simultaneously with dirichlet nor percent_noniid label skew methods. If you intent to use dirichlet (for quantity skew) use method == 'no-label-skew'")
if (
method == "no-label-skew" and quant_skew_method == "no-quant-skew" and spa_temp_skew_method == "no-spatemp-skew") and feat_skew_method == "gaussian-noise":
raise ValueError(
"When using Gaussian Noise (for feature skew) either 'method', 'quant_skew_method' or 'temp_skew_method' should be different to 'no-label-skew','no-quant-skew' or 'no-spatemp-skew', respectively.")
elif feat_skew_method == "gaussian-noise":
num_missing_classes = []
# Set list to append labels and features for each client
shards_no_completion = []
shards_with_completion = []
random_state_loop = self.random_state
# List to append the position of the recordings extracted
ids_list_no_completion = []
ids_list_with_completion = []
if method == "dirichlet":
lbl_distro_clients_pctg, lbl_distro_clients_num, lbl_distro_clients_idx, num_per_node = \
self.dirichlet_method(labels=label_list, local_nodes=num_clients, alpha=alpha,
random_state=random_state_loop)
elif method == "percent_noniid":
lbl_distro_clients_pctg, lbl_distro_clients_num, lbl_distro_clients_idx, num_per_node = \
self.percent_noniid_method(labels=label_list, local_nodes=num_clients, pct_noniid=percent_noniid,
random_state=random_state_loop)
elif quant_skew_method == "dirichlet":
lbl_distro_clients_pctg, lbl_distro_clients_num, lbl_distro_clients_idx, num_per_node = \
self.dirichlet_method_quant_skew(labels=label_list, local_nodes=num_clients,
alpha=alpha_quant_split, random_state=random_state_loop,
method=quant_skew_method)
elif quant_skew_method == "minsize-dirichlet":
lbl_distro_clients_pctg, lbl_distro_clients_num, lbl_distro_clients_idx, num_per_node = \
self.dirichlet_method_quant_skew(labels=label_list, local_nodes=num_clients,
alpha=alpha_quant_split, random_state=random_state_loop,
method=quant_skew_method)
elif spa_temp_skew_method == "st-dirichlet":
lbl_distro_clients_pctg, lbl_distro_clients_num, lbl_distro_clients_idx, num_per_node, \
st_var_dist_cli_pctg = self.st_dirichlet_method(labels=label_list, local_nodes=num_clients,
alpha=alpha_spa_temp, random_state=random_state_loop,
st_variable=spa_temp_var)
elif method not in ['percent_noniid', 'dirichlet', 'no-label-skew']:
raise ValueError("Method '" + method +
"' not implemented. Available label skew methods are: ['percent_noniid', "
"'dirichlet', 'no-label-skew'].")
else:
raise ValueError("Method '" + quant_skew_method +
"' not implemented. Available quantity skew methods are: ['dirichlet', "
"'minsize-dirichlet', 'no-quant-skew']")
# Calculate Jensen-Shannon distance
JS_dist = jensen_shannon_distance(lbl_distro_clients_pctg)
# Calculate Hellinger distance
H_dist = hellinger_distance(lbl_distro_clients_pctg)
# Calculate Earth Mover’s distance
emd_dist = earth_movers_distance(lbl_distro_clients_pctg)
distances = {'without_class_completion': {'jensen-shannon': JS_dist, 'hellinger': H_dist,
'earth-movers': emd_dist}}
data_df = pd.DataFrame(data)
data_df.columns = [*data_df.columns[:-1], 'class']
if spa_temp_skew_method == "st-dirichlet":
spatem_df = pd.DataFrame(spa_temp_var, columns=['spatemp_var'])
fed_data = {}
ids_list_fed_data = {}
pctg_distr = []
dist_hist_no_completion = []
dist_hist_with_completion = []
st_var_cli_list = []
spatemp_fed_data = {}
# Define number of bins for histogram
if bins == 'n_samples':
n_bins = np.array(image_list).shape[0]
else:
n_bins = bins
shape_x = np.array(np.array(image_list).shape)
# Select randomly some features for measuring feature skew
feat_sample_size = max(int(feat_sample_rate * np.prod(shape_x[1:])), 1)
np.random.seed(self.random_state)
idx_samp_feat = np.random.choice(np.arange(np.prod(shape_x[1:])), size=feat_sample_size, replace=False)
features = np.array(image_list).reshape((shape_x[0], np.prod(shape_x[1:])))[:, idx_samp_feat]
# Calculate bins_range with noise
bins_range = np.apply_along_axis(self.calculate_bins_range, axis=0, arr=features, sigma_noise=sigma_noise,
n_bins=n_bins)
for i in range(num_clients):
X = data_df.iloc[lbl_distro_clients_idx[i], 0].values
y = data_df.iloc[lbl_distro_clients_idx[i], 1].values
if isinstance(X[0], list):
X = np.array(X.tolist())
if sigma_noise > 0:
X = self.add_gaussian_noise(feat=X, sigma=sigma_noise, client_id=i + 1, local_nodes=num_clients,
random_state=random_state_loop)
X = np.array(X.tolist())
# flattenX = np.concatenate([np.ravel(X[j]) for j in range(X.shape[0])])
# Select randomly some features for measuring feature skew
shape_x = np.array(X.shape)
features = X.reshape((shape_x[0], np.prod(shape_x[1:])))[:, idx_samp_feat]
# Calculate histograms for each column
histograms = np.array([self.create_histogram(column, bins) for column, bins in zip(features.T,
bins_range.T)])
del features
else:
histograms = np.zeros((features.shape[1], 20))
dist_hist_no_completion.append(list(histograms))
del histograms
if i == (num_clients - 1):
# Reshape to make calculations per client
dist_hist_no_completion = np.transpose(np.array(dist_hist_no_completion), (1, 0, 2)).tolist()
dists = np.array(list(map(jensen_shannon_distance, dist_hist_no_completion)))
JS_dist_feat = np.mean(dists)
dists = np.array(list(map(hellinger_distance, dist_hist_no_completion)))
H_dist_feat = np.mean(dists)
dists = np.array(list(map(earth_movers_distance, dist_hist_no_completion)))
emd_dist_feat = np.mean(dists)
del dist_hist_no_completion
X = X.tolist()
y = y.tolist()
# Get the index (name) of the recordings sampled
ids_list_no_completion.append(lbl_distro_clients_idx[i])
shards_no_completion.append(list(zip(X, y)))
if spa_temp_skew_method == "st-dirichlet":
st_var_cli = spatem_df.iloc[lbl_distro_clients_idx[i], 0].values.tolist()
st_var_cli_list.append(list(st_var_cli))
# Add missing classes when sampling (mainly for extreme case percent iid = 100)
# diff_classes = list(set(label_list) - set(lbl_distro_clients_num[i]))
diff_classes = list(set(label_list) - set(y))
num_diff_classes = len(diff_classes)
num_missing_classes.append(num_diff_classes)
if num_diff_classes > 0:
for k in diff_classes:
vals = [idx for idx, y in enumerate(label_list) if y == k][0]
lbl_distro_clients_idx[i] = lbl_distro_clients_idx[i] + [vals]
X = data_df.iloc[lbl_distro_clients_idx[i], 0].values
y = data_df.iloc[lbl_distro_clients_idx[i], 1].values
if isinstance(X[0], list):
X = np.array(X.tolist())
if sigma_noise > 0:
X = self.add_gaussian_noise(feat=X, sigma=sigma_noise, client_id=i + 1, local_nodes=num_clients,
random_state=random_state_loop)
X = np.array(X.tolist())
# flattenX = np.concatenate([np.ravel(X[j]) for j in range(X.shape[0])])
# Select randomly some features for measuring feature skew
shape_x = np.array(X.shape)
features = X.reshape((shape_x[0], np.prod(shape_x[1:])))[:, idx_samp_feat]
# Calculate histograms for each column
histograms = np.array([self.create_histogram(column, bins) for column, bins in zip(features.T,
bins_range.T)])
del features
else:
histograms = np.zeros((features.shape[1], 20))
dist_hist_with_completion.append(list(histograms))
del histograms
X = X.tolist()
y = y.tolist()
# Get distribution of labels
df_aux = pd.DataFrame(y, columns=['label']).label.value_counts().reset_index()
df_node = pd.DataFrame(np.unique(label_list), columns=['index'])
df_node = df_node.merge(df_aux, how='left', left_on='index', right_on='index').replace(np.nan, 0)
df_node['perc'] = df_node.label / sum(df_node.label)
pctg_distr.append(list(df_node.perc))
# Get the index (name) of the recordings sampled
ids_list_with_completion.append(lbl_distro_clients_idx[i])
shards_with_completion.append(list(zip(X, y)))
if self.random_state is not None:
random_state_loop += self.random_state + 100
# Transpose (client, feature, bin) -> (feature, client, bin) so each distance is computed per feature across clients
dist_hist_with_completion = np.transpose(np.array(dist_hist_with_completion), (1, 0, 2)).tolist()
# Add elements to dictionary of federated data
fed_data['with_class_completion'] = \
{client_names[i]: shards_with_completion[i] for i in range(len(client_names))}
fed_data['without_class_completion'] = \
{client_names[i]: shards_no_completion[i] for i in range(len(client_names))}
# Add elements to dictionary of ids list of federated data
ids_list_fed_data['with_class_completion'] = ids_list_with_completion
ids_list_fed_data['without_class_completion'] = ids_list_no_completion
# Calculate Jensen-Shannon distance for labels
JS_dist = jensen_shannon_distance(pctg_distr)
# Calculate Hellinger distance for labels
HD_dist = hellinger_distance(pctg_distr)
# Calculate Earth Mover’s distance for labels
emd_dist = earth_movers_distance(pctg_distr)
distances['with_class_completion'] = {'jensen-shannon': JS_dist, 'hellinger': HD_dist,
'earth-movers': emd_dist}
distances['without_class_completion_feat'] = {'jensen-shannon': JS_dist_feat, 'hellinger': H_dist_feat,
'earth-movers': emd_dist_feat}
dists = np.array(list(map(jensen_shannon_distance, dist_hist_with_completion)))
JS_dist_feat = np.mean(dists)
dists = np.array(list(map(hellinger_distance, dist_hist_with_completion)))
H_dist_feat = np.mean(dists)
dists = np.array(list(map(earth_movers_distance, dist_hist_with_completion)))
emd_dist_feat = np.mean(dists)
distances['with_class_completion_feat'] = {'jensen-shannon': JS_dist_feat, 'hellinger': H_dist_feat,
'earth-movers': emd_dist_feat}
if spa_temp_skew_method == "st-dirichlet":
spatemp_fed_data['without_class_completion'] = \
{client_names[i]: st_var_cli_list[i] for i in range(len(client_names))}
elif feat_skew_method == "hist-dirichlet":
num_missing_classes = []
# Set list to append labels and features for each client
shards_no_completion = []
shards_with_completion = []
random_state_loop = self.random_state
# List to append the position of the recordings extracted
ids_list_no_completion = []
ids_list_with_completion = []
shape_x = np.array(np.array(image_list).shape)
# Define feature selected
if idx_feat == 'feat-mean':
feature_selected = np.mean(np.array(image_list).reshape((shape_x[0], np.prod(shape_x[1:]))), axis=1)
else:
feature_selected = np.array(image_list).reshape((shape_x[0], np.prod(shape_x[1:])))[:, idx_feat]
# Discretize the selected feature into feat_quantile quantile bins (duplicate bin edges are dropped)
feature_selected = pd.DataFrame(feature_selected, columns=['feature_selected'])
feature_selected = pd.qcut(feature_selected['feature_selected'], feat_quantile, labels=False,
duplicates='drop')
feat_distro_clients_pctg, feat_distro_clients_num, feat_distro_clients_idx, num_per_node = \
self.dirichlet_method(labels=feature_selected, local_nodes=num_clients, alpha=alpha_feat_split,
random_state=random_state_loop)
data_df = pd.DataFrame(data)
data_df.columns = [*data_df.columns[:-1], 'class']
fed_data = {}
ids_list_fed_data = {}
pctg_distr_no_completion = []
pctg_distr_with_completion = []
feat_pctg_distr_with_completion = []
for i in range(num_clients):
X = data_df.iloc[feat_distro_clients_idx[i], 0].values
y = data_df.iloc[feat_distro_clients_idx[i], 1].values
if isinstance(X[0], list):
X = np.array(X.tolist())
X = X.tolist()
y = y.tolist()
# Get the index (name) of the recordings sampled
ids_list_no_completion.append(feat_distro_clients_idx[i])
shards_no_completion.append(list(zip(X, y)))
# Get distribution of labels
df_aux = pd.DataFrame(y, columns=['label']).label.value_counts().reset_index()
df_node = pd.DataFrame(np.unique(label_list), columns=['index'])
df_node = df_node.merge(df_aux, how='left', left_on='index', right_on='index').replace(np.nan, 0)
df_node['perc'] = df_node.label / sum(df_node.label)
pctg_distr_no_completion.append(list(df_node.perc))
# Class completion: for every class missing from this client, add the first sample of that class found in label_list
diff_classes = list(set(label_list) - set(y))
num_diff_classes = len(diff_classes)
num_missing_classes.append(num_diff_classes)
if num_diff_classes > 0:
for k in diff_classes:
vals = [idx for idx, y in enumerate(label_list) if y == k][0]
feat_distro_clients_idx[i] = feat_distro_clients_idx[i] + [vals]
X = data_df.iloc[feat_distro_clients_idx[i], 0].values
y = data_df.iloc[feat_distro_clients_idx[i], 1].values
if isinstance(X[0], list):
X = np.array(X.tolist())
X = X.tolist()
y = y.tolist()
# Get distribution of labels
df_aux = pd.DataFrame(y, columns=['label']).label.value_counts().reset_index()
df_node = pd.DataFrame(np.unique(label_list), columns=['index'])
df_node = df_node.merge(df_aux, how='left', left_on='index', right_on='index').replace(np.nan, 0)
df_node['perc'] = df_node.label / sum(df_node.label)
pctg_distr_with_completion.append(list(df_node.perc))
# Get distribution of feature
df_aux = pd.DataFrame(feature_selected.values[feat_distro_clients_idx[i]],
columns=['feature']).feature.value_counts().reset_index()
df_node = pd.DataFrame(np.unique(feature_selected), columns=['index'])
df_node = df_node.merge(df_aux, how='left', left_on='index', right_on='index').replace(np.nan, 0)
df_node['perc'] = df_node.feature / sum(df_node.feature)
feat_pctg_distr_with_completion.append(list(df_node.perc))
# Get the index (name) of the recordings sampled
ids_list_with_completion.append(feat_distro_clients_idx[i])
shards_with_completion.append(list(zip(X, y)))
if self.random_state is not None:
random_state_loop += self.random_state + 100
# Add elements to dictionary of federated data
fed_data['with_class_completion'] = \
{client_names[i]: shards_with_completion[i] for i in range(len(client_names))}
fed_data['without_class_completion'] = \
{client_names[i]: shards_no_completion[i] for i in range(len(client_names))}
# Add elements to dictionary of ids list of federated data
ids_list_fed_data['with_class_completion'] = ids_list_with_completion
ids_list_fed_data['without_class_completion'] = ids_list_no_completion
# Calculate Jensen-Shannon distance for labels (no completion)
JS_dist = jensen_shannon_distance(pctg_distr_no_completion)
# Calculate Hellinger distance for labels (no completion)
HD_dist = hellinger_distance(pctg_distr_no_completion)
# Calculate Earth Mover’s distance for labels (no completion)
emd_dist = earth_movers_distance(pctg_distr_no_completion)
distances = {'without_class_completion': {'jensen-shannon': JS_dist, 'hellinger': HD_dist,
'earth-movers': emd_dist}}
# Calculate Jensen-Shannon distance for labels (with class completion)
JS_dist = jensen_shannon_distance(pctg_distr_with_completion)
# Calculate Hellinger distance for labels (with class completion)
HD_dist = hellinger_distance(pctg_distr_with_completion)
# Calculate Earth Mover’s distance for labels (with class completion)
emd_dist = earth_movers_distance(pctg_distr_with_completion)
distances['with_class_completion'] = {'jensen-shannon': JS_dist, 'hellinger': HD_dist,
'earth-movers': emd_dist}
# Calculate Jensen-Shannon distance for features (no completion)
JS_dist_feat = jensen_shannon_distance(feat_distro_clients_pctg)
# Calculate Hellinger distance for features (no completion)
HD_dist_feat = hellinger_distance(feat_distro_clients_pctg)
# Calculate Earth Mover’s distance for features (no completion)
emd_dist_feat = earth_movers_distance(feat_distro_clients_pctg)
distances['without_class_completion_feat'] = {'jensen-shannon': JS_dist_feat, 'hellinger': HD_dist_feat,
'earth-movers': emd_dist_feat}
# Calculate Jensen-Shannon distance for features (with class completion)
JS_dist_feat = jensen_shannon_distance(feat_pctg_distr_with_completion)
# Calculate Hellinger distance for features (with class completion)
HD_dist_feat = hellinger_distance(feat_pctg_distr_with_completion)
# Calculate Earth Mover’s distance for features (with class completion)
emd_dist_feat = earth_movers_distance(feat_pctg_distr_with_completion)
distances['with_class_completion_feat'] = {'jensen-shannon': JS_dist_feat, 'hellinger': HD_dist_feat,
'earth-movers': emd_dist_feat}
else:
raise ValueError("Method '" + feat_skew_method +
"' not implemented. Available feature skew methods are: ['gaussian-noise', "
"'hist-dirichlet']")
# Get sizes of each client
sizes = [len(value) for key, value in fed_data['without_class_completion'].items()]
perc_part_cli = [[elm / sum(sizes)] for elm in sizes]
# Calculate Jensen-Shannon distance for quantity (no completion)
JS_dist_quant = jensen_shannon_distance(perc_part_cli)
# Calculate Hellinger distance for quantity (no completion)
HD_dist_quant = hellinger_distance(perc_part_cli)
# Calculate Earth Mover’s distance for quantity (no completion)
emd_dist_quant = earth_movers_distance(perc_part_cli)
distances['without_class_completion_quant'] = {'jensen-shannon': JS_dist_quant, 'hellinger': HD_dist_quant,
'earth-movers': emd_dist_quant}
# Get sizes of each client
sizes = [len(value) for key, value in fed_data['with_class_completion'].items()]
perc_part_cli = [[elm / sum(sizes)] for elm in sizes]
# Calculate Jensen-Shannon distance for quantity (with class completion)
JS_dist_quant = jensen_shannon_distance(perc_part_cli)
# Calculate Hellinger distance for quantity (with class completion)
HD_dist_quant = hellinger_distance(perc_part_cli)
# Calculate Earth Mover’s distance for quantity (with class completion)
emd_dist_quant = earth_movers_distance(perc_part_cli)
distances['with_class_completion_quant'] = {'jensen-shannon': JS_dist_quant, 'hellinger': HD_dist_quant,
'earth-movers': emd_dist_quant}
if spa_temp_skew_method == "st-dirichlet":
# Spatio Temporal Skew distances
# Calculate Jensen-Shannon distance
JS_dist_spatemp = jensen_shannon_distance(st_var_dist_cli_pctg)
# Calculate Hellinger distance
H_dist_spatemp = hellinger_distance(st_var_dist_cli_pctg)
# Calculate Earth Mover’s distance
emd_dist_spatemp = earth_movers_distance(st_var_dist_cli_pctg)
distances['without_class_completion_spatemp'] = {'jensen-shannon': JS_dist_spatemp,
'hellinger': H_dist_spatemp,
'earth-movers': emd_dist_spatemp}
return fed_data, ids_list_fed_data, num_missing_classes, distances, spatemp_fed_data
else:
return fed_data, ids_list_fed_data, num_missing_classes, distances