import numpy as np
from .base import Layer
[docs]
class ReLu(Layer):
    """Rectified linear unit activation: ``max(0, x)`` applied element-wise.

    Relies on ``self.cache`` supplied by the ``Layer`` base class
    (assumed to be dict-like; confirm in ``base.py``).
    """

    def __init__(self) -> None:
        # The string tag is handed to the Layer base constructor.
        super().__init__("relu")

    def forward(self, X):
        """Remember the input under ``cache["X"]`` and return it with
        negative entries replaced by zero."""
        self.cache["X"] = X
        return np.maximum(0, X)

    def backward(self, dL_dZ):
        """Propagate the upstream gradient through the activation.

        The local derivative is 1 where the cached input is strictly
        positive and 0 everywhere else (including at exactly 0).
        """
        positive_mask = self.cache["X"] > 0
        return dL_dZ * positive_mask.astype(float)

    def copy(self):
        """Return a new ``ReLu`` whose cache is a shallow copy of this one's."""
        clone = ReLu()
        clone.cache = self.cache.copy()
        return clone
[docs]
class LeakyReLu(Layer):
    """Leaky ReLU activation: ``x`` for ``x > 0``, ``alpha * x`` otherwise.

    Relies on ``self.cache`` supplied by the ``Layer`` base class
    (assumed to be dict-like; confirm in ``base.py``).
    """

    def __init__(self, alpha=0.01) -> None:
        """Create the layer.

        Args:
            alpha: Slope applied to non-positive inputs.
        """
        # The string tag is handed to the Layer base constructor.
        super().__init__("leaky_relu")
        self.alpha = alpha

    def forward(self, X):
        """Cache the input under ``cache["X"]`` and return the activation."""
        self.cache["X"] = X
        return np.where(X > 0, X, self.alpha * X)

    def backward(self, dL_dZ):
        """Propagate the upstream gradient through the activation.

        The local derivative is 1 where the cached input is strictly
        positive and ``alpha`` elsewhere.
        """
        X = self.cache["X"]
        dL_dX = dL_dZ * np.where(X > 0, 1, self.alpha)
        return dL_dX

    def copy(self):
        """Return a new ``LeakyReLu`` with the same ``alpha`` and a shallow
        copy of the cache."""
        # Forward alpha explicitly; otherwise the clone would fall back to
        # the constructor default and behave differently from this layer.
        new_layer = LeakyReLu(self.alpha)
        new_layer.cache = self.cache.copy()
        return new_layer
[docs]
class GELU(Layer):
    """GELU activation using the tanh approximation.

    ``gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + c * x**3)))`` with
    ``c = self.constant``.

    Relies on ``self.cache`` supplied by the ``Layer`` base class
    (assumed to be dict-like; confirm in ``base.py``).
    """

    def __init__(self) -> None:
        # The string tag is handed to the Layer base constructor.
        super().__init__("gelu")
        # Cubic coefficient of the tanh approximation.
        self.constant = 0.044715

    def forward(self, X):
        """Cache the input (``cache["X"]``) and the tanh term
        (``cache["t"]``), then return the activation."""
        self.cache["X"] = X
        inner = np.sqrt(2.0 / np.pi) * (X + self.constant * X ** 3)
        # np.tanh saturates cleanly to +/-1; an exp-based quotient would
        # turn into inf/inf = NaN once ``inner`` grows past ~710.
        t = np.tanh(inner)
        self.cache["t"] = t
        return 0.5 * X * (1 + t)

    def backward(self, dL_dZ):
        """Propagate the upstream gradient through the activation.

        With ``u = sqrt(2/pi) * (x + c * x**3)`` and ``t = tanh(u)``:

            d gelu / dx = 0.5 * (1 + t) + 0.5 * x * (1 - t**2) * du/dx
            du/dx       = sqrt(2/pi) * (1 + 3 * c * x**2)
        """
        t = self.cache["t"]
        X = self.cache["X"]
        # Chain-rule factor of the tanh argument; it scales only the
        # second term of the product rule, not the whole derivative.
        d_inner = np.sqrt(2.0 / np.pi) * (1 + 3 * self.constant * X ** 2)
        local_grad = 0.5 * (1 + t) + 0.5 * X * (1 - t ** 2) * d_inner
        return dL_dZ * local_grad

    def copy(self):
        """Return a new ``GELU`` whose cache is a shallow copy of this one's."""
        new_layer = GELU()
        new_layer.cache = self.cache.copy()
        return new_layer
[docs]
class Swish(Layer):
    """Swish activation: ``x * sigmoid(x)`` applied element-wise.

    Relies on ``self.cache`` supplied by the ``Layer`` base class
    (assumed to be dict-like; confirm in ``base.py``).
    """

    def __init__(self) -> None:
        # The string tag is handed to the Layer base constructor.
        super().__init__("swish")

    def forward(self, X):
        """Cache the input (``cache["X"]``) and its sigmoid (``cache["Z"]``),
        then return their product."""
        gate = 1 / (1 + np.exp(-X))
        self.cache["X"] = X
        self.cache["Z"] = gate
        return X * gate

    def backward(self, dL_dZ):
        """Propagate the upstream gradient through the activation.

        With ``s = sigmoid(x)`` the local derivative is
        ``s + x * s * (1 - s)``.
        """
        inputs = self.cache["X"]
        gate = self.cache["Z"]
        local_grad = gate + inputs * gate * (1 - gate)
        return dL_dZ * local_grad

    def copy(self):
        """Return a new ``Swish`` whose cache is a shallow copy of this one's."""
        clone = Swish()
        clone.cache = self.cache.copy()
        return clone
[docs]
class Sigmoid(Layer):
    """Logistic sigmoid activation: ``1 / (1 + exp(-x))`` element-wise.

    Relies on ``self.cache`` supplied by the ``Layer`` base class
    (assumed to be dict-like; confirm in ``base.py``).
    """

    def __init__(self) -> None:
        # The string tag is handed to the Layer base constructor.
        super().__init__("sigmoid")

    def forward(self, X):
        """Cache the input (``cache["X"]``) and the output (``cache["Z"]``)
        and return the output."""
        self.cache["X"] = X
        activated = 1 / (1 + np.exp(-X))
        self.cache["Z"] = activated
        return self.cache["Z"]

    def backward(self, dL_dZ):
        """Propagate the upstream gradient through the activation.

        Uses the cached forward output ``s``: the local derivative is
        ``s * (1 - s)``.
        """
        out = self.cache["Z"]
        return dL_dZ * out * (1 - out)

    def copy(self):
        """Return a new ``Sigmoid`` whose cache is a shallow copy of this
        one's."""
        clone = Sigmoid()
        clone.cache = self.cache.copy()
        return clone
[docs]
class Tanh(Layer):
    """Hyperbolic tangent activation applied element-wise.

    Relies on ``self.cache`` supplied by the ``Layer`` base class
    (assumed to be dict-like; confirm in ``base.py``).
    """

    def __init__(self) -> None:
        # The string tag is handed to the Layer base constructor.
        super().__init__("tanh")

    def forward(self, X):
        """Cache the input (``cache["X"]``) and the output (``cache["Z"]``)
        and return the output."""
        self.cache["X"] = X
        # np.tanh saturates to +/-1 for large |X|. The explicit quotient
        # (e^x - e^-x) / (e^x + e^-x) overflows to inf/inf = NaN there.
        self.cache["Z"] = np.tanh(X)
        return self.cache["Z"]

    def backward(self, dL_dZ):
        """Propagate the upstream gradient through the activation.

        Uses the cached forward output ``z = tanh(x)``: the local
        derivative is ``1 - z**2``.
        """
        Z = self.cache["Z"]
        dL_dX = dL_dZ * (1 - Z**2)
        return dL_dX

    def copy(self):
        """Return a new ``Tanh`` whose cache is a shallow copy of this one's."""
        new_layer = Tanh()
        new_layer.cache = self.cache.copy()
        return new_layer
[docs]
class Softmax(Layer):
    """Softmax activation.

    Normalises over the whole array when the input has fewer than two
    dimensions, otherwise over axis 1.

    Relies on ``self.cache`` supplied by the ``Layer`` base class
    (assumed to be dict-like; confirm in ``base.py``).
    """

    def __init__(self) -> None:
        # The string tag is handed to the Layer base constructor.
        super().__init__("softmax")

    def forward(self, X):
        """Cache the input (``cache["X"]``) and the probabilities
        (``cache["Z"]``) and return the probabilities."""
        self.cache["X"] = X
        axis = None if X.ndim < 2 else 1
        # Subtracting the maximum keeps every exponent <= 0, so np.exp
        # cannot overflow; the shift cancels in the ratio.
        max_a = np.max(X, axis=axis, keepdims=True)
        # Evaluate the exponentials once and reuse them for both the
        # numerator and the normalising sum.
        exps = np.exp(X - max_a)
        self.cache["Z"] = exps / np.sum(exps, axis=axis, keepdims=True)
        return self.cache["Z"]

    def backward(self, dL_dZ):
        """
        Backpropagate through Softmax activation.

        When paired with CategoricalCrossEntropy, the combined gradient
        (y_pred - one_hot(y_true)) / N is computed entirely in the loss,
        so this layer is a straight pass-through: the upstream gradient is
        returned unchanged and the softmax Jacobian is NOT applied here.
        """
        return dL_dZ

    def copy(self):
        """Return a new ``Softmax`` whose cache is a shallow copy of this
        one's."""
        new_layer = Softmax()
        new_layer.cache = self.cache.copy()
        return new_layer
[docs]
class ELU(Layer):
    """Exponential linear unit: ``x`` for ``x > 0``, otherwise
    ``alpha * (exp(x) - 1)``.

    Relies on ``self.cache`` supplied by the ``Layer`` base class
    (assumed to be dict-like; confirm in ``base.py``).
    """

    def __init__(self, alpha=1.0) -> None:
        """Create the layer.

        Args:
            alpha: Scale of the exponential branch used for non-positive
                inputs; stored as ``self.alpha_activation``.
        """
        # The string tag is handed to the Layer base constructor.
        super().__init__("elu")
        self.alpha_activation = alpha

    def forward(self, X):
        """Cache the input (``cache["X"]``) and the output (``cache["Z"]``)
        and return the output."""
        self.cache["X"] = X
        # np.where evaluates both branches on every element. Clamping the
        # exponent at 0 stops large positive inputs from overflowing in a
        # branch whose value is discarded anyway; for X <= 0 the clamp is
        # a no-op, so the selected values are unchanged.
        neg_branch = self.alpha_activation * (np.exp(np.minimum(X, 0)) - 1)
        self.cache["Z"] = np.where(X > 0, X, neg_branch)
        return self.cache["Z"]

    def backward(self, dL_dZ):
        """
        Backpropagate through ELU activation.

        ELU derivative: 1 if X > 0, else alpha * exp(X)
        """
        X = self.cache["X"]
        # Same clamp as in forward: avoids overflow in the unused branch
        # without changing the values picked for X <= 0.
        neg_grad = self.alpha_activation * np.exp(np.minimum(X, 0))
        dL_dX = dL_dZ * np.where(X > 0, 1.0, neg_grad)
        return dL_dX

    def copy(self):
        """Return a new ``ELU`` with the same alpha and a shallow copy of
        the cache."""
        new_layer = ELU(self.alpha_activation)
        new_layer.cache = self.cache.copy()
        return new_layer