import numpy as np
from scipy.stats import entropy
from math import log, e
import pandas as pd
import timeit
def entropy1(labels, base=None):
    """Compute the Shannon entropy of a label distribution using scipy.

    Args:
        labels: sequence of hashable label values.
        base: logarithm base; natural log (e) when None (scipy's default).

    Returns:
        float: entropy of the empirical distribution of ``labels``.
    """
    # Only the counts matter for entropy; the unique values are unused.
    _, counts = np.unique(labels, return_counts=True)
    # scipy normalizes the counts to probabilities internally.
    return entropy(counts, base=base)
def entropy2(labels, base=None):
    """Compute entropy of a label distribution with an explicit loop.

    Args:
        labels: sequence of hashable label values.
        base: logarithm base; natural log (e) when None.

    Returns:
        Shannon entropy of the empirical label distribution (0 for
        fewer than two samples or a single class).
    """
    n_labels = len(labels)
    if n_labels <= 1:
        return 0  # zero or one sample carries no uncertainty

    # Only the counts matter; the unique values themselves are unused.
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / n_labels
    n_classes = np.count_nonzero(probs)

    if n_classes <= 1:
        return 0  # a single class has zero entropy

    ent = 0.

    # Compute entropy
    base = e if base is None else base
    for p in probs:
        ent -= p * log(p, base)

    return ent


def entropy3(labels, base=None):
    """Compute entropy of a label distribution using pandas (vectorized).

    Args:
        labels: sequence of hashable label values.
        base: logarithm base; natural log (e) when None.

    Returns:
        float: Shannon entropy of the empirical label distribution.
    """
    vc = pd.Series(labels).value_counts(normalize=True, sort=False)
    base = e if base is None else base
    # Change of base: log_b(x) = ln(x) / ln(b).
    return -(vc * np.log(vc) / np.log(base)).sum()


def entropy4(labels, base=None):
    """Compute entropy of a label distribution using numpy only (vectorized).

    Args:
        labels: sequence of hashable label values.
        base: logarithm base; natural log (e) when None.

    Returns:
        float: Shannon entropy of the empirical label distribution.
    """
    _, counts = np.unique(labels, return_counts=True)
    norm_counts = counts / counts.sum()
    base = e if base is None else base
    # Change of base: log_b(x) = ln(x) / ln(b).
    return -(norm_counts * np.log(norm_counts) / np.log(base)).sum()


def compute_gini(label_array):
    """Calculate the Gini coefficient of an array of values.

    Uses the sorted-array (mean-difference) formulation:
    G = sum_i((2*i - n - 1) * x_i) / (n * sum(x)), i = 1..n over the
    sorted values. Assumes non-negative values with a positive sum —
    TODO confirm with callers.

    :param label_array: a numpy array of values, shape (n,)
    :return gini: Gini coefficient in [0, 1)
    """
    # Your code goes here (~6 lines)
    # 2.3
    array = np.sort(label_array)
    index = np.arange(1, array.shape[0] + 1)
    gini = ((np.sum((2 * index - array.shape[0] - 1) * array))
            / (array.shape[0] * np.sum(array)))
    return gini


b = np.zeros((1000))
b[1] = 1
print("gini: ", compute_gini(b))

# labels = np.array([1, 3, 5, 2, 3, 5, 3, 2, 1, 3, 4, 5])
# print(entropy1(labels))
# print(entropy2(labels))
# print(entropy3(labels))
# print(entropy4(labels))