tf.keras
實作,過程包含:
Distiller
類別。本篇使用 Keras 官方範例定義的 Distiller
類別。
該類別繼承於 tf.keras.Model
,並改寫以下方法:
compile
:這個模型需要一些額外的參數來編譯,比如老師和學生的損失,alpha 和 temp 。train_step
:控制模型的訓練方式,真正的知識蒸餾邏輯就寫在這裡。呼叫 model.fit 時,實際執行的就是這個方法。test_step
:控制模型的評估。呼叫 model.evaluate 時,實際執行的就是這個方法。
class Distiller(keras.Model):
def __init__(self, student, teacher):
    """Keep references to the two models this distiller operates on.

    Args:
        student: Model whose weights are updated in ``train_step``.
        teacher: Model queried in inference mode to produce soft targets.
    """
    super().__init__()
    # Assignment order is kept as teacher-then-student so that the order in
    # which Keras tracks the sub-models is unchanged.
    self.teacher = teacher
    self.student = student
def compile(
    self,
    optimizer,
    metrics,
    student_loss_fn,
    distillation_loss_fn,
    alpha=0.1,
    temperature=3,
):
    """Set up the optimizer, the metrics and the two distillation losses.

    Args:
        optimizer: Keras optimizer applied to the student weights.
        metrics: Keras metrics used for evaluation.
        student_loss_fn: Loss between the student predictions and the
            ground-truth labels.
        distillation_loss_fn: Loss between the softened student
            predictions and the softened teacher predictions.
        alpha: Weight given to ``student_loss_fn``; the distillation
            loss is weighted by ``1 - alpha``.
        temperature: Value the predictions are divided by before the
            softmax. A larger temperature gives softer distributions.
    """
    # Only the optimizer and the metrics go to the base class; the losses
    # are stored on the instance and combined by hand in train_step.
    super().compile(optimizer=optimizer, metrics=metrics)

    self.alpha = alpha
    self.temperature = temperature
    self.student_loss_fn = student_loss_fn
    self.distillation_loss_fn = distillation_loss_fn
def train_step(self, data):
    """Run one optimisation step of the student on a single batch.

    The teacher is evaluated in inference mode, the student in training
    mode, and the student weights are updated from
    ``alpha * student_loss + (1 - alpha) * distillation_loss``.

    Args:
        data: An ``(x, y)`` pair of inputs and ground-truth labels.

    Returns:
        A dict mapping each metric name to its current result, plus the
        entries ``"student_loss"`` and ``"distillation_loss"`` for this
        batch. ``"distillation_loss"`` already includes the
        ``temperature ** 2`` scaling.
    """
    # Unpack the batch.
    x, y = data

    # The teacher forward pass stays outside the tape: gradients are only
    # taken with respect to the student variables.
    teacher_predictions = self.teacher(x, training=False)

    with tf.GradientTape() as tape:
        # Forward pass of the student.
        student_predictions = self.student(x, training=True)

        # Hard-label loss against the ground truth.
        student_loss = self.student_loss_fn(y, student_predictions)

        # Soft-label loss between the temperature-softened distributions.
        # Gradients of the soft targets scale as 1 / T**2, so the loss is
        # multiplied by T**2 to keep the balance set by `alpha` roughly
        # independent of the chosen temperature (Hinton et al., 2015).
        distillation_loss = self.distillation_loss_fn(
            tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
            tf.nn.softmax(student_predictions / self.temperature, axis=1),
        ) * self.temperature ** 2

        loss = (
            self.alpha * student_loss
            + (1 - self.alpha) * distillation_loss
        )

    # Compute gradients for the student only and apply them.
    trainable_vars = self.student.trainable_variables
    gradients = tape.gradient(loss, trainable_vars)
    self.optimizer.apply_gradients(zip(gradients, trainable_vars))

    # Update the metrics configured in `compile()`.
    self.compiled_metrics.update_state(y, student_predictions)

    # Report the metrics together with both loss terms.
    results = {m.name: m.result() for m in self.metrics}
    results.update(
        {
            "student_loss": student_loss,
            "distillation_loss": distillation_loss,
        }
    )
    return results
def test_step(self, data):
    """Evaluate the student on one batch; the teacher is not involved.

    Args:
        data: An ``(x, y)`` pair of inputs and ground-truth labels.

    Returns:
        A dict mapping each metric name to its current result, followed
        by a ``"student_loss"`` entry for this batch.
    """
    inputs, labels = data

    # Student forward pass in inference mode.
    student_output = self.student(inputs, training=False)

    # Hard-label loss only; no distillation term at evaluation time.
    hard_loss = self.student_loss_fn(labels, student_output)

    # Feed the metrics configured in `compile()`.
    self.compiled_metrics.update_state(labels, student_output)

    # Metrics come first, the loss entry is added last.
    report = {metric.name: metric.result() for metric in self.metrics}
    report["student_loss"] = hard_loss
    return report
提醒2件事情:
模型的最後一層不要加 softmax
,因為知識蒸餾需要模型未經 softmax 的原始輸出(logits),請記得去掉這層。可以將學生模型視為教師模型的簡化(或壓縮)版本。
def big_model_builder():
    """Build the larger CNN: two conv/pool stages and a 10-unit head.

    Returns:
        An uncompiled ``tf.keras.Sequential`` taking 28x28 inputs. The
        final ``Dense(10)`` layer has no activation argument.
    """
    layers = tf.keras.layers

    stack = [
        layers.InputLayer(input_shape=(28, 28)),
        # Add the single channel axis expected by Conv2D.
        layers.Reshape(target_shape=(28, 28, 1)),
    ]
    # Two identical convolution -> max-pooling stages.
    for _ in range(2):
        stack.append(
            layers.Conv2D(filters=12, kernel_size=(3, 3), activation='relu')
        )
        stack.append(layers.MaxPooling2D(pool_size=(2, 2)))
    stack.append(layers.Flatten())
    stack.append(layers.Dense(10))

    return tf.keras.Sequential(stack)
def small_model_builder():
    """Build the smaller CNN: one conv/pool stage and a 10-unit head.

    Returns:
        An uncompiled ``tf.keras.Sequential`` taking 28x28 inputs. The
        final ``Dense(10)`` layer has no activation argument.
    """
    layers = tf.keras.layers
    return tf.keras.Sequential(
        [
            layers.InputLayer(input_shape=(28, 28)),
            # Add the single channel axis expected by Conv2D.
            layers.Reshape(target_shape=(28, 28, 1)),
            layers.Conv2D(
                filters=12, kernel_size=(3, 3), activation='relu'
            ),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Flatten(),
            layers.Dense(10),
        ]
    )
# One teacher and two students built from the same small architecture: one
# student is distilled, the other is trained from scratch for comparison.
teacher = big_model_builder()
student = small_model_builder()
student_scratch = small_model_builder()

# The teacher is trained as an ordinary classifier.
teacher.compile(
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(),
)
teacher.summary()

# Fit on the training split, then keep the second value returned by
# evaluate() as the teacher's entry in ACCURACY.
teacher.fit(train_images, train_labels, epochs=2)
_, ACCURACY['teacher model'] = teacher.evaluate(test_images, test_labels)
建立 Distiller
類別的實例,並傳入學生和教師模型:distiller = Distiller(student=student, teacher=teacher)
。然後用合適的參數編譯並訓練。
# Wrap the teacher and the student in a Distiller and configure it.
distiller = Distiller(student=student, teacher=teacher)
distiller.compile(
    temperature=10,
    alpha=0.1,
    distillation_loss_fn=keras.losses.KLDivergence(),
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(
        from_logits=True
    ),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
    optimizer=keras.optimizers.Adam(),
)

# Transfer the teacher's knowledge to the student.
distiller.fit(train_images, train_labels, epochs=2, shuffle=False)

# Distiller.test_step reports the compiled metrics first and student_loss
# last, so the accuracy is the first value unpacked here.
ACCURACY['distiller student model'], _ = distiller.evaluate(
    test_images,
    test_labels,
)
# Baseline: the same small architecture trained the usual way, without a
# teacher, using the same epochs and shuffle setting as the distilled run.
student_scratch.compile(
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(),
)
student_scratch.summary()

# Fit on the training split, then keep the second value returned by
# evaluate() as the baseline's entry in ACCURACY.
student_scratch.fit(train_images, train_labels, epochs=2, shuffle=False)
_, ACCURACY['student from scrath model'] = student_scratch.evaluate(
    test_images, test_labels
)
ACCURACY
{'teacher model': 0.9822999835014343,
'distiller student model': 0.9729999899864197,
'student from scrath model': 0.9697999954223633}