Accelerate AI Training with Transfer Learning

Slide 1: Understanding Transfer Learning Fundamentals

Transfer learning enables models to leverage knowledge from pre-trained networks, significantly reducing training time and computational resources. This approach allows neural networks to apply learned features from one domain to accelerate learning in another, similar to how humans transfer knowledge between related tasks.

# Basic transfer learning structure
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D

# Load pre-trained VGG16 model without top layers
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze base model layers
for layer in base_model.layers:
    layer.trainable = False

# Add new layers for transfer learning
num_classes = 10  # number of target classes; adjust to your dataset
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(num_classes, activation='softmax')(x)

# Create new model
transfer_model = Model(inputs=base_model.input, outputs=predictions)
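A minimal sketch of how this model might then be compiled and trained, assuming train_ds and val_ds are tf.data pipelines yielding preprocessed 224x224 images with one-hot labels:

# Only the new head is trained; the frozen VGG16 layers act as a fixed feature extractor
transfer_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

history = transfer_model.fit(train_ds, validation_data=val_ds, epochs=10)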

Slide 2: Mathematical Foundation of Transfer Learning

Transfer learning's effectiveness can be understood through domain adaptation theory, where knowledge from a source domain is transferred to a target domain. The mathematical framework involves minimizing the distance between feature distributions while maintaining discriminative power.

# Mathematical representation (LaTeX notation)
"""
Domain Adaptation Objective Function:
$$\min_{\theta} \mathcal{L}_{task}(\theta) + \lambda \mathcal{L}_{adapt}(\theta)$$

Where:
$$\mathcal{L}_{task}$$ is the task-specific loss on the target data
$$\mathcal{L}_{adapt}$$ is the domain adaptation loss between source and target feature distributions
$$\lambda$$ is the trade-off parameter balancing the two terms
"""

Slide 3: Setting Up the Data Pipeline

Creating an efficient data pipeline is crucial for transfer learning success. This implementation demonstrates how to prepare and preprocess data with TensorFlow's tf.data API; augmentation suited to transfer learning can be layered onto the same pipeline, as sketched after the code block.

def create_data_pipeline(image_paths, labels, batch_size=32):
    # Create dataset from image paths and labels
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    
    def process_path(path, label):
        img = tf.io.read_file(path)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, [224, 224])
        img = tf.keras.applications.vgg16.preprocess_input(img)
        return img, label
    
    dataset = dataset.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset
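Augmentation can be added with one more map step; a minimal sketch (the specific augmentations are illustrative, and would normally be applied to the decoded image before preprocess_input):

def augment(img, label):
    # Light augmentations that preserve label semantics for most image tasks
    img = tf.image.random_flip_left_right(img)
    img = tf.image.random_brightness(img, max_delta=0.1)
    return img, label

# Applied between the decode/resize step and batching, e.g.:
# dataset = dataset.map(augment, num_parallel_calls=tf.data.AUTOTUNE)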

Slide 4: Feature Extraction Implementation

Feature extraction represents the first phase of transfer learning, where we utilize pre-trained weights to extract meaningful features from our new dataset without modifying the original model's weights.

def create_feature_extractor(base_model, num_classes):
    # Create feature extractor
    feature_extractor = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(512, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    
    # Compile model
    feature_extractor.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return feature_extractor
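Assuming the frozen base_model and the data pipeline from the previous slides, the feature-extraction phase might look like this (class count and epoch count are placeholders):

feature_extractor = create_feature_extractor(base_model, num_classes=10)

history = feature_extractor.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10
)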

Slide 5: Fine-tuning Strategy

Fine-tuning involves carefully unfreezing specific layers of the pre-trained model and training them at a lower learning rate. This process allows the model to adapt its learned features to the new domain while preventing catastrophic forgetting.

def implement_fine_tuning(model, num_layers_to_unfreeze=5):
    # Unfreeze the last n layers. If the pre-trained network is nested as a
    # single layer (e.g. inside a Sequential), index into base_model.layers instead.
    for layer in model.layers[-num_layers_to_unfreeze:]:
        layer.trainable = True
    
    # Recompile with lower learning rate
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model
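Putting the two phases together, a typical workflow (sketched with assumed dataset names and epoch counts) first trains the new head on frozen features, then fine-tunes the top of the pre-trained network:

# Phase 1: train only the new classification head
feature_extractor.fit(train_dataset, validation_data=val_dataset, epochs=10)

# Phase 2: unfreeze the top layers and continue at a much lower learning rate
fine_tuned = implement_fine_tuning(feature_extractor, num_layers_to_unfreeze=5)
fine_tuned.fit(train_dataset, validation_data=val_dataset, epochs=5)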

Slide 6: Custom Layer Adaptation

When performing transfer learning, custom layers can be added to better adapt the model to specific domain requirements. This implementation shows how to create and integrate custom layers while maintaining the pre-trained network's knowledge.

import tensorflow as tf

class AdaptiveLayer(tf.keras.layers.Layer):
    def __init__(self, units, activation='relu'):
        super(AdaptiveLayer, self).__init__()
        self.units = units
        self.activation = tf.keras.activations.get(activation)
        
    def build(self, input_shape):
        self.w = self.add_weight(
            shape=(input_shape[-1], self.units),
            initializer='glorot_uniform',
            trainable=True,
            name='adaptive_weights'
        )
        self.b = self.add_weight(
            shape=(self.units,),
            initializer='zeros',
            trainable=True,
            name='adaptive_bias'
        )
        
    def call(self, inputs):
        return self.activation(tf.matmul(inputs, self.w) + self.b)
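The custom layer drops into a transfer head like any built-in layer; a brief sketch assuming the frozen base_model from earlier and ten target classes:

inputs = tf.keras.Input(shape=(224, 224, 3))
x = base_model(inputs, training=False)               # frozen feature extractor
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = AdaptiveLayer(256)(x)                             # domain-adaptive bottleneck
outputs = tf.keras.layers.Dense(10, activation='softmax')(x)
adapted_model = tf.keras.Model(inputs, outputs)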

Slide 7: Progressive Fine-tuning Implementation

Progressive fine-tuning gradually unfreezes and trains layers from top to bottom, allowing for more controlled adaptation of the pre-trained features while maintaining lower-level feature representations.

def progressive_fine_tuning(model, train_data, validation_data, epochs_per_stage=5):
    # Assumes all weight-bearing layers start frozen; they are unfrozen one per
    # stage, working from the top of the network downwards.
    history = []
    trainable_layers = [layer for layer in model.layers if len(layer.weights) > 0]
    
    for i in range(len(trainable_layers)):
        print(f"Stage {i+1}: Unfreezing layer {trainable_layers[-i-1].name}")
        trainable_layers[-i-1].trainable = True
        
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5/(i+1)),
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )
        
        stage_history = model.fit(
            train_data,
            validation_data=validation_data,
            epochs=epochs_per_stage,
            verbose=1
        )
        history.append(stage_history.history)
    
    return history
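A brief usage sketch with assumed dataset names, showing how the per-stage histories can be inspected:

stage_histories = progressive_fine_tuning(
    transfer_model, train_dataset, val_dataset, epochs_per_stage=3
)

# Track how validation accuracy evolves as deeper layers are unfrozen
for i, h in enumerate(stage_histories):
    print(f"Stage {i+1}: final val_accuracy = {h['val_accuracy'][-1]:.4f}")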

Slide 8: Implementation of Domain Adaptation

Domain adaptation techniques help bridge the gap between source and target domains. This implementation demonstrates how to calculate and minimize domain discrepancy using Maximum Mean Discrepancy (MMD).

def compute_mmd(source_features, target_features):
    def gaussian_kernel(x, y, sigma=1.0):
        # Reduce over the feature axis only, so broadcasting x[:, None] against
        # y[None, :] yields a full pairwise kernel matrix
        return tf.exp(-tf.reduce_sum(tf.square(x - y), axis=-1) / (2 * sigma**2))
    
    def compute_kernel_means(x, y):
        xx = tf.reduce_mean(gaussian_kernel(x[:, None], x[None, :]))
        xy = tf.reduce_mean(gaussian_kernel(x[:, None], y[None, :]))
        yy = tf.reduce_mean(gaussian_kernel(y[:, None], y[None, :]))
        return xx - 2 * xy + yy
    
    mmd_loss = compute_kernel_means(source_features, target_features)
    return mmd_loss

class DomainAdaptationModel(tf.keras.Model):
    def __init__(self, base_model, num_classes):
        super(DomainAdaptationModel, self).__init__()
        self.feature_extractor = base_model
        self.classifier = tf.keras.layers.Dense(num_classes, activation='softmax')
        
    def call(self, inputs):
        features = self.feature_extractor(inputs)
        predictions = self.classifier(features)
        return features, predictions
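A minimal sketch of a custom training step that combines the classification loss with the MMD term; the batch names, optimizer, and lambda weight are assumptions, and the features are assumed to be pooled 2-D vectors (pool them first if the extractor returns feature maps):

optimizer = tf.keras.optimizers.Adam(1e-4)
lambda_adapt = 0.1  # assumed trade-off weight

@tf.function
def adaptation_train_step(model, source_x, source_y, target_x):
    with tf.GradientTape() as tape:
        source_feats, source_preds = model(source_x)
        target_feats, _ = model(target_x)
        task_loss = tf.reduce_mean(
            tf.keras.losses.categorical_crossentropy(source_y, source_preds))
        adapt_loss = compute_mmd(source_feats, target_feats)
        total_loss = task_loss + lambda_adapt * adapt_loss
    grads = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return total_loss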

Slide 9: Real-world Example: Medical Image Classification

Transfer learning is particularly valuable in medical imaging where labeled data is scarce. This implementation shows how to adapt a pre-trained model for X-ray classification tasks.

def create_medical_classifier():
    # Load pre-trained DenseNet121
    base_model = tf.keras.applications.DenseNet121(
        weights='imagenet',
        include_top=False,
        input_shape=(224, 224, 3)
    )
    
    # Custom layers for medical imaging
    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(512, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(2, activation='softmax')  # Two-class (binary) output
    ])
    
    # Compile; class weights for imbalanced data can be passed to fit()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=['accuracy', tf.keras.metrics.AUC()]
    )
    
    return model
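Chest X-ray datasets are usually imbalanced, so class weights would normally be supplied at training time; a hedged sketch with assumed class counts and dataset names:

model = create_medical_classifier()

# Assumed example: 1,200 normal vs. 400 abnormal training images
class_weight = {0: 1.0, 1: 1200 / 400}

model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=15,
    class_weight=class_weight
)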

Slide 10: Results for Medical Image Classification

This implementation showcases the performance metrics and evaluation results from the medical image classification transfer learning model, including confusion matrix and ROC curve generation.

import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

def evaluate_medical_model(model, test_dataset):
    # Predict on test set (labels are assumed to be one-hot encoded)
    y_pred = model.predict(test_dataset)
    y_true = np.concatenate([y for _, y in test_dataset])
    
    # Calculate metrics
    accuracy = accuracy_score(y_true.argmax(axis=1), y_pred.argmax(axis=1))
    auc = roc_auc_score(y_true, y_pred)
    
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"AUC-ROC Score: {auc:.4f}")
    
    # Generate confusion matrix
    cm = confusion_matrix(y_true.argmax(axis=1), y_pred.argmax(axis=1))
    
    return {
        'accuracy': accuracy,
        'auc': auc,
        'confusion_matrix': cm,
        'predictions': y_pred
    }

# Example output:
"""
Test Accuracy: 0.9234
AUC-ROC Score: 0.9456
Confusion Matrix:
[[156  12]
 [  8 124]]
"""

Slide 11: Handling Catastrophic Forgetting

Catastrophic forgetting occurs when fine-tuning causes the model to lose previously learned knowledge. This implementation introduces elastic weight consolidation (EWC) to preserve important parameters.

class EWC(tf.keras.callbacks.Callback):
    # Used here as a helper for computing an EWC penalty that is added to the
    # training loss, rather than through the usual callback hooks.
    def __init__(self, original_model, fisher_multiplier=100):
        super(EWC, self).__init__()
        # Snapshot the trainable weights so indices match trainable_weights later
        self.original_weights = [tf.identity(w) for w in original_model.trainable_weights]
        self.fisher_multiplier = fisher_multiplier
        self.fisher_information = None
        
    def calculate_fisher_information(self, model, dataset):
        fisher_info = [tf.zeros_like(w) for w in model.trainable_weights]
            
        for x, _ in dataset:
            with tf.GradientTape() as tape:
                output = model(x, training=True)
                log_likelihood = tf.reduce_mean(tf.math.log(output + 1e-7))
            
            grads = tape.gradient(log_likelihood, model.trainable_weights)
            for idx, grad in enumerate(grads):
                if grad is not None:
                    fisher_info[idx] += tf.square(grad)
                
        self.fisher_information = fisher_info
        return fisher_info
    
    def ewc_loss(self, model):
        # Penalize movement away from the original weights, scaled by how
        # important each parameter was for the original task
        penalty = 0.0
        for idx, weights in enumerate(model.trainable_weights):
            importance = (self.fisher_information[idx]
                          if self.fisher_information is not None else 1.0)
            penalty += tf.reduce_sum(
                self.fisher_multiplier * importance *
                tf.square(weights - self.original_weights[idx])
            )
        return penalty
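A hedged sketch of folding the EWC penalty into a custom fine-tuning loop; the dataset names, Fisher-estimation sample size, and learning rate are assumptions:

ewc = EWC(original_model=transfer_model, fisher_multiplier=100)
ewc.calculate_fisher_information(transfer_model, train_dataset.take(50))

optimizer = tf.keras.optimizers.Adam(1e-5)

for x, y in train_dataset:
    with tf.GradientTape() as tape:
        preds = transfer_model(x, training=True)
        task_loss = tf.reduce_mean(
            tf.keras.losses.categorical_crossentropy(y, preds))
        loss = task_loss + ewc.ewc_loss(transfer_model)
    grads = tape.gradient(loss, transfer_model.trainable_variables)
    optimizer.apply_gradients(zip(grads, transfer_model.trainable_variables))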

Slide 12: Real-world Example: Natural Language Processing Transfer Learning

This implementation demonstrates how to apply transfer learning principles to NLP tasks using pre-trained transformers while maintaining computational efficiency.

from transformers import TFBertModel, BertTokenizer

def create_nlp_transfer_model(num_classes, max_length=128):
    # Load pre-trained BERT
    bert = TFBertModel.from_pretrained('bert-base-uncased')
    
    # Create custom model architecture
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32)
    
    bert_outputs = bert(input_ids, attention_mask=attention_mask)
    pooled_output = bert_outputs.pooler_output  # pooled [CLS] representation
    
    dropout = tf.keras.layers.Dropout(0.3)(pooled_output)
    output = tf.keras.layers.Dense(num_classes, activation='softmax')(dropout)
    
    model = tf.keras.Model(
        inputs=[input_ids, attention_mask],
        outputs=output
    )
    
    # Freeze BERT layers initially
    for layer in bert.layers:
        layer.trainable = False
    
    return model

# Example usage
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = create_nlp_transfer_model(num_classes=3)
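A brief sketch of compiling and training the model on tokenized text; the example texts, labels, and hyperparameters are placeholders:

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Placeholder data for illustration
train_texts = ["great product", "terrible service", "it was okay"]
train_labels = [0, 1, 2]

encodings = tokenizer(train_texts, truncation=True, padding='max_length',
                      max_length=128, return_tensors='tf')

model.fit(
    [encodings['input_ids'], encodings['attention_mask']],
    tf.constant(train_labels),
    epochs=3,
    batch_size=16
)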

Slide 13: Results for NLP Transfer Learning

The following implementation shows comprehensive performance metrics for the NLP transfer learning model, including precision, recall, and F1 scores across different classes.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_nlp_model(model, test_texts, test_labels, tokenizer):
    # Tokenize test data, padding to the fixed length the model expects
    encodings = tokenizer(
        test_texts,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='tf'
    )
    
    # Generate predictions
    predictions = model.predict([
        encodings['input_ids'],
        encodings['attention_mask']
    ])
    
    # Calculate metrics
    results = {
        'accuracy': accuracy_score(test_labels, predictions.argmax(axis=1)),
        'precision': precision_score(test_labels, predictions.argmax(axis=1), average='weighted'),
        'recall': recall_score(test_labels, predictions.argmax(axis=1), average='weighted'),
        'f1': f1_score(test_labels, predictions.argmax(axis=1), average='weighted')
    }
    
    print("Model Performance Metrics:")
    for metric, value in results.items():
        print(f"{metric.capitalize()}: {value:.4f}")
    
    return results

# Example output:
"""
Model Performance Metrics:
Accuracy: 0.8956
Precision: 0.8872
Recall: 0.8956
F1: 0.8913
"""

Slide 14: Knowledge Distillation Implementation

Knowledge distillation combines transfer learning with model compression, allowing smaller models to learn from larger pre-trained ones while maintaining performance.

class DistillationModel(tf.keras.Model):
    def __init__(self, student_model, teacher_model, temperature=3.0):
        super(DistillationModel, self).__init__()
        self.student_model = student_model
        self.teacher_model = teacher_model
        self.temperature = temperature
        
    def compile(self, optimizer, loss, metrics):
        # The student loss must be passed here so self.compiled_loss works in train_step
        super(DistillationModel, self).compile(optimizer=optimizer, loss=loss, metrics=metrics)
        self.distillation_loss_tracker = tf.keras.metrics.Mean(name="distillation_loss")
        
    def train_step(self, data):
        x, y = data
        
        # Teacher predictions
        teacher_predictions = self.teacher_model(x, training=False)
        
        with tf.GradientTape() as tape:
            # Student predictions
            student_predictions = self.student_model(x, training=True)
            
            # Calculate losses
            distillation_loss = tf.keras.losses.KLDivergence()(
                tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                tf.nn.softmax(student_predictions / self.temperature, axis=1)
            )
            
            student_loss = self.compiled_loss(y, student_predictions)
            total_loss = (0.7 * student_loss) + (0.3 * distillation_loss)
            
        # Update student weights
        trainable_vars = self.student_model.trainable_variables
        gradients = tape.gradient(total_loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        
        # Update metrics
        self.compiled_metrics.update_state(y, student_predictions)
        self.distillation_loss_tracker.update_state(distillation_loss)
        
        results = {m.name: m.result() for m in self.metrics}
        results["distillation_loss"] = self.distillation_loss_tracker.result()
        return results
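A hedged usage sketch, assuming a large pre-trained teacher (for example the fine-tuned transfer model) and a smaller student network, plus a prepared train_dataset:

distiller = DistillationModel(student_model=student, teacher_model=teacher, temperature=3.0)

distiller.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy()]
)

distiller.fit(train_dataset, epochs=10)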

Slide 15: Additional Resources