Slide 1: Understanding Transfer Learning Fundamentals
Transfer learning enables models to leverage knowledge from pre-trained networks, significantly reducing training time and computational resources. This approach allows neural networks to apply learned features from one domain to accelerate learning in another, similar to how humans transfer knowledge between related tasks.
# Basic transfer learning structure
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
# Load pre-trained VGG16 model without top layers
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
# Freeze base model layers
for layer in base_model.layers:
layer.trainable = False
# Add new layers for transfer learning
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(num_classes, activation='softmax')(x)  # num_classes = number of classes in the target dataset
# Create new model
transfer_model = Model(inputs=base_model.input, outputs=predictions)
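To make the example end to end, a minimal compile-and-train sketch is shown below; num_classes and the train_ds/val_ds datasets are placeholders assumed to come from your own data rather than anything defined on this slide.
# Minimal usage sketch (num_classes, train_ds and val_ds are assumed to exist)
transfer_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
# history = transfer_model.fit(train_ds, validation_data=val_ds, epochs=10)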
Slide 2: Mathematical Foundation of Transfer Learning
Transfer learning's effectiveness can be understood through domain adaptation theory, where knowledge from a source domain is transferred to a target domain. The mathematical framework involves minimizing the distance between feature distributions while maintaining discriminative power.
# Mathematical representation (LaTeX notation)
"""
Domain Adaptation Objective Function:
$$\min_{\theta} \; \mathcal{L}_{\text{task}}(\theta) + \lambda\,\mathcal{L}_{\text{adapt}}(\theta)$$
Where:
$$\mathcal{L}_{\text{task}}$$ is the task-specific loss,
$$\mathcal{L}_{\text{adapt}}$$ is the domain adaptation loss, and
$$\lambda$$ is the adaptation trade-off parameter.
"""
Slide 3: Setting Up the Data Pipeline
Creating an efficient data pipeline is crucial for transfer learning success. This implementation demonstrates how to prepare and preprocess data using TensorFlow's data API, including augmentation techniques suitable for transfer learning scenarios.
def create_data_pipeline(image_paths, labels, batch_size=32):
# Create dataset from image paths and labels
dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
def process_path(path, label):
img = tf.io.read_file(path)
img = tf.image.decode_jpeg(img, channels=3)
img = tf.image.resize(img, [224, 224])
img = tf.keras.applications.vgg16.preprocess_input(img)
return img, label
dataset = dataset.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
return dataset
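A short usage sketch follows; the file paths and labels are made-up placeholders, and the one-hot encoding matches the categorical_crossentropy loss used in later slides.
# Hypothetical usage: image paths and labels are illustrative placeholders
image_paths = ['data/cats/img_001.jpg', 'data/dogs/img_002.jpg']
labels = tf.keras.utils.to_categorical([0, 1], num_classes=2)
train_ds = create_data_pipeline(image_paths, labels, batch_size=2)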
Slide 4: Feature Extraction Implementation
Feature extraction represents the first phase of transfer learning, where we utilize pre-trained weights to extract meaningful features from our new dataset without modifying the original model's weights.
def create_feature_extractor(base_model, num_classes):
# Create feature extractor
feature_extractor = tf.keras.Sequential([
base_model,
tf.keras.layers.GlobalAveragePooling2D(),
tf.keras.layers.Dense(512, activation='relu'),
tf.keras.layers.Dropout(0.3),
tf.keras.layers.Dense(num_classes, activation='softmax')
])
# Compile model
feature_extractor.compile(
optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
loss='categorical_crossentropy',
metrics=['accuracy']
)
return feature_extractor
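A brief usage sketch, assuming a frozen VGG16 backbone and a five-class target task (both illustrative choices):
# Usage sketch: build a frozen backbone and attach the feature-extraction head
base = tf.keras.applications.VGG16(weights='imagenet', include_top=False,
                                   input_shape=(224, 224, 3))
base.trainable = False                                       # keep pre-trained weights fixed
extractor = create_feature_extractor(base, num_classes=5)    # 5 classes, illustrative
# extractor.fit(train_ds, validation_data=val_ds, epochs=10)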
Slide 5: Fine-tuning Strategy
Fine-tuning involves carefully unfreezing specific layers of the pre-trained model and training them at a lower learning rate. This process allows the model to adapt its learned features to the new domain while preventing catastrophic forgetting.
def implement_fine_tuning(model, num_layers_to_unfreeze=5):
# Unfreeze the last n layers
for layer in model.layers[-num_layers_to_unfreeze:]:
layer.trainable = True
# Recompile with lower learning rate
model.compile(
optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
loss='categorical_crossentropy',
metrics=['accuracy']
)
return model
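In practice this runs after the feature-extraction phase has converged; the sketch below reuses transfer_model from Slide 1 and assumes train_ds/val_ds datasets.
# Usage sketch: unfreeze the last few layers and continue training briefly
fine_tuned = implement_fine_tuning(transfer_model, num_layers_to_unfreeze=4)
# fine_tuned.fit(train_ds, validation_data=val_ds, epochs=5)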
Slide 6: Custom Layer Adaptation
When performing transfer learning, custom layers can be added to better adapt the model to specific domain requirements. This implementation shows how to create and integrate custom layers while maintaining the pre-trained network's knowledge.
import tensorflow as tf
class AdaptiveLayer(tf.keras.layers.Layer):
def __init__(self, units, activation='relu'):
super(AdaptiveLayer, self).__init__()
self.units = units
self.activation = tf.keras.activations.get(activation)
def build(self, input_shape):
self.w = self.add_weight(
shape=(input_shape[-1], self.units),
initializer='glorot_uniform',
trainable=True,
name='adaptive_weights'
)
self.b = self.add_weight(
shape=(self.units,),
initializer='zeros',
trainable=True,
name='adaptive_bias'
)
def call(self, inputs):
return self.activation(tf.matmul(inputs, self.w) + self.b)
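A hedged sketch of how such a layer might be dropped into a transfer model; the VGG16 backbone and the three-class head are assumptions made for illustration.
# Sketch: inserting AdaptiveLayer between a frozen backbone and the classifier head
base = tf.keras.applications.VGG16(weights='imagenet', include_top=False,
                                   input_shape=(224, 224, 3))
base.trainable = False
inputs = tf.keras.Input(shape=(224, 224, 3))
x = base(inputs, training=False)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = AdaptiveLayer(256)(x)                                    # domain-specific adaptation
outputs = tf.keras.layers.Dense(3, activation='softmax')(x)  # 3 classes, illustrative
adapted_model = tf.keras.Model(inputs, outputs)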
Slide 7: Progressive Fine-tuning Implementation
Progressive fine-tuning gradually unfreezes and trains layers from top to bottom, allowing for more controlled adaptation of the pre-trained features while maintaining lower-level feature representations.
def progressive_fine_tuning(model, train_data, validation_data, epochs_per_stage=5):
history = []
trainable_layers = [layer for layer in model.layers if len(layer.weights) > 0]
for i in range(len(trainable_layers)):
print(f"Stage {i+1}: Unfreezing layer {trainable_layers[-i-1].name}")
trainable_layers[-i-1].trainable = True
model.compile(
optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5/(i+1)),
loss='categorical_crossentropy',
metrics=['accuracy']
)
stage_history = model.fit(
train_data,
validation_data=validation_data,
epochs=epochs_per_stage,
verbose=1
)
history.append(stage_history.history)
return history
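The per-stage learning rate 1e-5/(i+1) shrinks as deeper layers are exposed, which limits how far early-layer features can drift. A minimal invocation, assuming the Slide 1 model and tf.data datasets as placeholders:
# Usage sketch: train_ds and val_ds are assumed tf.data datasets
stage_histories = progressive_fine_tuning(transfer_model, train_ds, val_ds,
                                          epochs_per_stage=2)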
Slide 8: Implementation of Domain Adaptation
Domain adaptation techniques help bridge the gap between source and target domains. This implementation demonstrates how to calculate and minimize domain discrepancy using Maximum Mean Discrepancy (MMD).
def compute_mmd(source_features, target_features):
    def gaussian_kernel(x, y, sigma=1.0):
        # Pairwise squared distances between rows of x and rows of y
        sq_dists = tf.reduce_sum(tf.square(x[:, None, :] - y[None, :, :]), axis=-1)
        return tf.exp(-sq_dists / (2.0 * sigma ** 2))
    # MMD^2 estimate from kernel means: k(s, s) + k(t, t) - 2 * k(s, t)
    xx = tf.reduce_mean(gaussian_kernel(source_features, source_features))
    xy = tf.reduce_mean(gaussian_kernel(source_features, target_features))
    yy = tf.reduce_mean(gaussian_kernel(target_features, target_features))
    mmd_loss = xx - 2.0 * xy + yy
    return mmd_loss
class DomainAdaptationModel(tf.keras.Model):
def __init__(self, base_model, num_classes):
super(DomainAdaptationModel, self).__init__()
self.feature_extractor = base_model
self.classifier = tf.keras.layers.Dense(num_classes, activation='softmax')
def call(self, inputs):
features = self.feature_extractor(inputs)
predictions = self.classifier(features)
return features, predictions
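A hedged sketch of how these pieces could be combined in a custom training step is shown below. It assumes the feature extractor already outputs pooled 2-D feature vectors (for example, a backbone created with pooling='avg'), and the optimizer, loss, and lambda_adapt weight are illustrative choices rather than part of the slides.
# Illustrative training step combining classification loss and MMD (assumptions noted above)
# da_model = DomainAdaptationModel(base_model_with_pooling, num_classes=3)   # assumed setup
optimizer = tf.keras.optimizers.Adam(1e-4)
cce = tf.keras.losses.CategoricalCrossentropy()
lambda_adapt = 0.1   # trade-off weight, assumed

@tf.function
def domain_adaptation_step(da_model, source_x, source_y, target_x):
    with tf.GradientTape() as tape:
        source_feats, source_preds = da_model(source_x)
        target_feats, _ = da_model(target_x)
        task_loss = cce(source_y, source_preds)                # supervised loss on source labels
        adapt_loss = compute_mmd(source_feats, target_feats)   # feature-distribution discrepancy
        loss = task_loss + lambda_adapt * adapt_loss
    grads = tape.gradient(loss, da_model.trainable_variables)
    optimizer.apply_gradients(zip(grads, da_model.trainable_variables))
    return loss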
Slide 9: Real-world Example: Medical Image Classification
Transfer learning is particularly valuable in medical imaging where labeled data is scarce. This implementation shows how to adapt a pre-trained model for X-ray classification tasks.
def create_medical_classifier():
# Load pre-trained DenseNet121
base_model = tf.keras.applications.DenseNet121(
weights='imagenet',
include_top=False,
input_shape=(224, 224, 3)
)
# Custom layers for medical imaging
model = tf.keras.Sequential([
base_model,
tf.keras.layers.GlobalAveragePooling2D(),
tf.keras.layers.Dense(512, activation='relu'),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Dropout(0.5),
tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')  # Single sigmoid unit for binary classification
])
    # Compile; class imbalance is typically handled with class_weight at fit time
model.compile(
optimizer=tf.keras.optimizers.Adam(1e-4),
loss=tf.keras.losses.BinaryCrossentropy(),
metrics=['accuracy', tf.keras.metrics.AUC()]
)
return model
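Because the loss itself is unweighted, class imbalance is usually addressed with class_weight when calling fit; the weights and dataset names below are placeholders.
# Usage sketch: class weights and datasets are illustrative placeholders
medical_model = create_medical_classifier()
class_weight = {0: 1.0, 1: 2.5}        # up-weight the rarer (abnormal) class, assumed ratio
# medical_model.fit(train_ds, validation_data=val_ds, epochs=10, class_weight=class_weight)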
Slide 10: Results for Medical Image Classification
This implementation showcases the performance metrics and evaluation results from the medical image classification transfer learning model, including confusion matrix and ROC curve generation.
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

def evaluate_medical_model(model, test_dataset):
    # Predicted probabilities from the sigmoid output, flattened to a 1-D vector
    y_prob = model.predict(test_dataset).ravel()
    y_true = np.concatenate([y for _, y in test_dataset]).ravel()
    y_pred = (y_prob >= 0.5).astype(int)
    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob)
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"AUC-ROC Score: {auc:.4f}")
    # Generate confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    return {
        'accuracy': accuracy,
        'auc': auc,
        'confusion_matrix': cm,
        'predictions': y_prob
    }
# Example output:
"""
Test Accuracy: 0.9234
AUC-ROC Score: 0.9456
Confusion Matrix:
[[156 12]
[ 8 124]]
"""
Slide 11: Handling Catastrophic Forgetting
Catastrophic forgetting occurs when fine-tuning causes the model to lose previously learned knowledge. This implementation introduces elastic weight consolidation (EWC) to preserve important parameters.
class EWC(tf.keras.callbacks.Callback):
    def __init__(self, original_model, fisher_multiplier=100):
        super(EWC, self).__init__()
        # Snapshot the trainable weights before fine-tuning begins
        self.original_weights = [tf.identity(w) for w in original_model.trainable_weights]
        self.fisher_multiplier = fisher_multiplier

    def calculate_fisher_information(self, model, dataset):
        # Diagonal Fisher information, estimated from squared gradients of the log-likelihood
        fisher_info = [tf.zeros_like(w) for w in model.trainable_weights]
        num_batches = 0
        for x, _ in dataset:
            with tf.GradientTape() as tape:
                output = model(x, training=True)
                log_likelihood = tf.reduce_mean(tf.math.log(output + 1e-7))
            grads = tape.gradient(log_likelihood, model.trainable_weights)
            for idx, grad in enumerate(grads):
                fisher_info[idx] += tf.square(grad)
            num_batches += 1
        return [f / num_batches for f in fisher_info]

    def ewc_loss(self, model, fisher_info):
        # Quadratic penalty anchoring each parameter to its pre-fine-tuning value,
        # weighted by its estimated Fisher information
        penalty = 0.0
        for idx, weights in enumerate(model.trainable_weights):
            penalty += tf.reduce_sum(
                self.fisher_multiplier * fisher_info[idx] *
                tf.square(weights - self.original_weights[idx])
            )
        return penalty
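The penalty is intended to be added to the task loss during fine-tuning; a minimal sketch, assuming a model, a fine_tune_ds dataset, and a categorical loss (all placeholders), might look like:
# Illustrative custom training loop with the EWC penalty added to the task loss
ewc = EWC(original_model=model)
fisher_info = ewc.calculate_fisher_information(model, fine_tune_ds)
cce = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(1e-5)

for x, y in fine_tune_ds:
    with tf.GradientTape() as tape:
        preds = model(x, training=True)
        loss = cce(y, preds) + ewc.ewc_loss(model, fisher_info)
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))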
Slide 12: Real-world Example: Natural Language Processing Transfer Learning
This implementation demonstrates how to apply transfer learning principles to NLP tasks using pre-trained transformers while maintaining computational efficiency.
from transformers import TFBertModel, BertTokenizer
def create_nlp_transfer_model(num_classes, max_length=128):
# Load pre-trained BERT
bert = TFBertModel.from_pretrained('bert-base-uncased')
# Create custom model architecture
input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
    bert_outputs = bert(input_ids, attention_mask=attention_mask)
    pooled_output = bert_outputs.pooler_output  # [CLS]-based pooled representation
dropout = tf.keras.layers.Dropout(0.3)(pooled_output)
output = tf.keras.layers.Dense(num_classes, activation='softmax')(dropout)
model = tf.keras.Model(
inputs=[input_ids, attention_mask],
outputs=output
)
    # Freeze the BERT encoder for the initial feature-extraction phase
    bert.trainable = False
return model
# Example usage
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = create_nlp_transfer_model(num_classes=3)
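A short training sketch follows; the sample texts, labels, and learning rate are placeholders, and the sparse integer labels pair with a sparse categorical loss.
# Hypothetical fine-tuning setup: texts and labels are placeholders
texts = ["sample sentence one", "sample sentence two"]
labels = tf.constant([0, 2])
enc = tokenizer(texts, truncation=True, padding='max_length',
                max_length=128, return_tensors='tf')
model.compile(
    optimizer=tf.keras.optimizers.Adam(2e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
# model.fit([enc['input_ids'], enc['attention_mask']], labels, epochs=3, batch_size=2)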
Slide 13: Results for NLP Transfer Learning
The following implementation shows comprehensive performance metrics for the NLP transfer learning model, including precision, recall, and F1 scores across different classes.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_nlp_model(model, test_texts, test_labels, tokenizer):
# Tokenize test data
encodings = tokenizer(
test_texts,
truncation=True,
padding=True,
max_length=128,
return_tensors='tf'
)
# Generate predictions
predictions = model.predict([
encodings['input_ids'],
encodings['attention_mask']
])
# Calculate metrics
results = {
'accuracy': accuracy_score(test_labels, predictions.argmax(axis=1)),
'precision': precision_score(test_labels, predictions.argmax(axis=1), average='weighted'),
'recall': recall_score(test_labels, predictions.argmax(axis=1), average='weighted'),
'f1': f1_score(test_labels, predictions.argmax(axis=1), average='weighted')
}
print("Model Performance Metrics:")
for metric, value in results.items():
print(f"{metric.capitalize()}: {value:.4f}")
return results
# Example output:
"""
Model Performance Metrics:
Accuracy: 0.8956
Precision: 0.8872
Recall: 0.8956
F1: 0.8913
"""
Slide 14: Knowledge Distillation Implementation
Knowledge distillation combines transfer learning with model compression, allowing smaller models to learn from larger pre-trained ones while maintaining performance.
class DistillationModel(tf.keras.Model):
def __init__(self, student_model, teacher_model, temperature=3.0):
super(DistillationModel, self).__init__()
self.student_model = student_model
self.teacher_model = teacher_model
self.temperature = temperature
    def compile(self, optimizer, loss, metrics):
        # The task loss passed here is what compiled_loss() uses in train_step
        super(DistillationModel, self).compile(optimizer=optimizer, loss=loss, metrics=metrics)
        self.distillation_loss_tracker = tf.keras.metrics.Mean(name="distillation_loss")
def train_step(self, data):
x, y = data
# Teacher predictions
teacher_predictions = self.teacher_model(x, training=False)
with tf.GradientTape() as tape:
# Student predictions
student_predictions = self.student_model(x, training=True)
# Calculate losses
distillation_loss = tf.keras.losses.KLDivergence()(
tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
tf.nn.softmax(student_predictions / self.temperature, axis=1)
)
student_loss = self.compiled_loss(y, student_predictions)
total_loss = (0.7 * student_loss) + (0.3 * distillation_loss)
# Update student weights
trainable_vars = self.student_model.trainable_variables
gradients = tape.gradient(total_loss, trainable_vars)
self.optimizer.apply_gradients(zip(gradients, trainable_vars))
# Update metrics
self.compiled_metrics.update_state(y, student_predictions)
self.distillation_loss_tracker.update_state(distillation_loss)
return {m.name: m.result() for m in self.metrics}
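Using the distiller requires a pre-trained teacher and a smaller student; the sketch below uses placeholder MNIST-sized models that output raw logits, which matches the softmax-with-temperature terms in the training step above.
# Usage sketch: teacher_model and student_model are placeholders that output logits
teacher_model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(10)            # logits
])
student_model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10)            # logits
])
distiller = DistillationModel(student_model, teacher_model, temperature=3.0)
distiller.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)
# distiller.fit(train_ds, epochs=5)      # train_ds yields (images, integer labels)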
Slide 15: Additional Resources
- "Deep Transfer Learning for Medical Imaging" - https://arxiv.org/abs/2004.00235
- "A Survey on Deep Transfer Learning" - https://arxiv.org/abs/1808.01974
- "Progressive Neural Networks" - https://arxiv.org/abs/1606.04671
- "Elastic Weight Consolidation for Better Domain Adaptation" - https://arxiv.org/abs/1906.05873
- For implementation details and more resources, search for "Transfer Learning Surveys" on Google Scholar
- Visit TensorFlow's official documentation for updated transfer learning guides and tutorials