41 changes: 32 additions & 9 deletions CLAUDE.md
@@ -11,29 +11,40 @@ This documentation covers:
- Maximum response tokens configuration
- Usage examples and best practices

### Future Implementation Roadmap
### Advanced Sampling Features

The following advanced sampling parameters will be implemented in future versions:
The following advanced sampling parameters are now available:

#### Top-P Sampling (Nucleus Sampling)
- [random(probabilityThreshold:seed:)](https://developer.apple.com/documentation/foundationmodels/generationoptions/samplingmode/random(probabilitythreshold:seed:))
- Controls diversity by sampling only from the smallest set of highest-probability tokens whose cumulative probability reaches the threshold
- Planned parameter: `--top-p <value>` (0.0-1.0 range)
- Usage: `--randomness "random:top-p=0.9"` (0.0-1.0 range)
- More dynamic than top-k as it adapts to the confidence distribution

#### Top-K Sampling
- [random(top:seed:)](https://developer.apple.com/documentation/foundationmodels/generationoptions/samplingmode/random(top:seed:))
- Limits token selection to the K most likely tokens
- Planned parameter: `--top-k <value>` (integer value)
- Usage: `--randomness "random:top-k=50"` (integer value)
- Applies a consistent cutoff regardless of the shape of the probability distribution

These will extend the current `--randomness` parameter to provide more granular control over the sampling behavior.
#### Stop Sequences
- Specify strings where the model should stop generating text
- CLI Parameter: `--stop "###,END"` (comma-separated list of stop sequences)
- API Parameter: `"stop": ["###", "END"]` (array of strings)
- When any stop sequence is encountered, generation stops at that point
- The stop sequence itself is excluded from the output
- Multiple stop sequences can be specified
- Works in both streaming and non-streaming modes

These parameters provide granular control over the sampling behavior and output formatting.
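For reference, a minimal API request exercising the new `stop` field might look like the sketch below. The host, port, model name, and exact endpoint path here are assumptions based on the server's OpenAI-compatible conventions rather than values taken from this change; only `messages`, `temperature`, and `stop` are fields this PR touches.

```bash
# Hypothetical request against a locally running afm server (host, port, and model are placeholders)
curl -s http://localhost:9999/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "foundation",
    "messages": [{"role": "user", "content": "Write a short story, then print ### and keep going."}],
    "temperature": 0.7,
    "stop": ["###", "END"]
  }'
# Generation is truncated at the first occurrence of "###" or "END";
# the matched stop string itself is not included in the returned content.
```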

### Implementation Notes
- Temperature: Controls randomness/creativity in responses (0.0 = deterministic, 1.0 = highly creative)
- Randomness: "greedy" for deterministic output, "random" for varied output
- **Temperature**: Controls randomness/creativity in responses (0.0 = deterministic, 1.0 = highly creative)
- **Randomness**: "greedy" for deterministic output, "random" for varied output, or advanced sampling modes
- **Stop Sequences**: Truncate output at specified strings, useful for structured output formats
- Apple defaults are used when parameters are not specified
- All parameters are optional and validated at CLI parsing level
- Stop sequences from CLI and API requests are merged, with duplicates removed
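The merge rule in the last note can be illustrated with a short sketch; the flags, port, and route below are illustrative assumptions, as in the example above.

```bash
# Start the server with a CLI-level stop sequence (default host/port assumed)
./afm --stop "###"

# The request supplies its own stop sequences; CLI "###" and API ["END", "###"] are
# merged and de-duplicated, so the effective set passed to generation is ["###", "END"]
curl -s http://localhost:9999/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"messages": [{"role": "user", "content": "test prompt"}], "stop": ["END", "###"]}'
```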

### Build Commands
```bash
@@ -75,13 +86,25 @@ Debug logging shows:
# Test with temperature only
./afm -t 1.0 -s "test prompt"

# Test with both parameters
# Test with temperature and randomness
./afm -t 0.5 -r greedy -s "test prompt"

# Test with top-p sampling
./afm -r "random:top-p=0.9" -s "test prompt"

# Test with top-k sampling
./afm -r "random:top-k=50" -s "test prompt"

# Test with stop sequences
./afm --stop "###,END" -s "Write a story. ###"

# Test with multiple parameters
./afm -t 0.7 -r "random:top-p=0.95" --stop "---" -s "test prompt"

# Test validation (should fail)
./afm -t 1.5 -s "test prompt" # Temperature out of range
./afm -r invalid -s "test prompt" # Invalid randomness value

# Test with debug logging
AFM_DEBUG=1 ./afm -t 1.0 -r greedy -s "test prompt"
AFM_DEBUG=1 ./afm -t 1.0 -r greedy --stop "###" -s "test prompt"
```
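One combination not covered above is the optional `:seed=<int>` modifier mentioned in the `--randomness` help text; the invocations below are a sketch of that syntax and have not been verified here.

```bash
# Reproducible sampling (sketch): repeating the same seed with the same prompt should
# produce the same output, assuming the seed modifier behaves as the help text describes
./afm -r "random:top-p=0.9:seed=42" -s "test prompt"
./afm -r "random:top-k=50:seed=42" -s "test prompt"
```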
38 changes: 33 additions & 5 deletions Sources/MacLocalAPI/Controllers/ChatCompletionsController.swift
@@ -8,14 +8,16 @@ struct ChatCompletionsController: RouteCollection {
private let temperature: Double?
private let randomness: String?
private let permissiveGuardrails: Bool

init(streamingEnabled: Bool = true, instructions: String = "You are a helpful assistant", adapter: String? = nil, temperature: Double? = nil, randomness: String? = nil, permissiveGuardrails: Bool) {
private let stop: String?

init(streamingEnabled: Bool = true, instructions: String = "You are a helpful assistant", adapter: String? = nil, temperature: Double? = nil, randomness: String? = nil, permissiveGuardrails: Bool, stop: String? = nil) {
self.streamingEnabled = streamingEnabled
self.instructions = instructions
self.adapter = adapter
self.temperature = temperature
self.randomness = randomness
self.permissiveGuardrails = permissiveGuardrails
self.stop = stop
}
func boot(routes: RoutesBuilder) throws {
let v1 = routes.grouped("v1")
Expand Down Expand Up @@ -55,8 +57,9 @@ struct ChatCompletionsController: RouteCollection {
// Use temperature from API request if provided, otherwise use CLI parameter
let effectiveTemperature = chatRequest.temperature ?? temperature
let effectiveRandomness = randomness
let effectiveStop = mergeStopSequences(cliStop: stop, apiStop: chatRequest.stop)

let content = try await foundationService.generateResponse(for: chatRequest.messages, temperature: effectiveTemperature, randomness: effectiveRandomness)
let content = try await foundationService.generateResponse(for: chatRequest.messages, temperature: effectiveTemperature, randomness: effectiveRandomness, stop: effectiveStop)

let promptTokens = estimateTokens(for: chatRequest.messages)
let completionTokens = estimateTokens(for: content)
Expand Down Expand Up @@ -123,7 +126,31 @@ struct ChatCompletionsController: RouteCollection {

return Int(max(charBasedTokens, wordBasedTokens))
}


private func mergeStopSequences(cliStop: String?, apiStop: [String]?) -> [String]? {
var mergedStop: [String] = []

// Parse CLI stop parameter (comma-separated)
if let cliStopString = cliStop {
let cliStopArray = cliStopString.split(separator: ",").map { String($0.trimmingCharacters(in: .whitespaces)) }
mergedStop.append(contentsOf: cliStopArray)
}

// Add API stop sequences
if let apiStopArray = apiStop {
mergedStop.append(contentsOf: apiStopArray)
}

// Return nil if empty, otherwise return unique sequences
if mergedStop.isEmpty {
return nil
}

// Remove duplicates while preserving order
var seen = Set<String>()
return mergedStop.filter { seen.insert($0).inserted }
}

private func createStreamingResponse(req: Request, chatRequest: ChatCompletionRequest, foundationService: FoundationModelService) async throws -> Response {
let httpResponse = Response(status: .ok)
httpResponse.headers.add(name: .contentType, value: "text/event-stream")
@@ -142,9 +169,10 @@
// Use temperature from API request if provided, otherwise use CLI parameter
let effectiveTemperature = chatRequest.temperature ?? self.temperature
let effectiveRandomness = self.randomness
let effectiveStop = self.mergeStopSequences(cliStop: self.stop, apiStop: chatRequest.stop)

// Get response with proper timing measurement
let (content, promptTime) = try await foundationService.generateStreamingResponseWithTiming(for: chatRequest.messages, temperature: effectiveTemperature, randomness: effectiveRandomness)
let (content, promptTime) = try await foundationService.generateStreamingResponseWithTiming(for: chatRequest.messages, temperature: effectiveTemperature, randomness: effectiveRandomness, stop: effectiveStop)

// Start streaming timing
let completionStartTime = Date()
46 changes: 36 additions & 10 deletions Sources/MacLocalAPI/Models/FoundationModelService.swift
@@ -228,18 +228,18 @@ class FoundationModelService {
#endif
}

func generateResponse(for messages: [Message], temperature: Double? = nil, randomness: String? = nil) async throws -> String {
func generateResponse(for messages: [Message], temperature: Double? = nil, randomness: String? = nil, stop: [String]? = nil) async throws -> String {
#if canImport(FoundationModels) && !DISABLE_FOUNDATION_MODELS
guard let session = session else {
throw FoundationModelError.sessionCreationFailed
}

let prompt = formatMessagesAsPrompt(messages)

do {
let options = try createGenerationOptions(temperature: temperature, randomness: randomness)
let response = try await session.respond(to: prompt, options: options)
return response.content
return applyStopSequences(to: response.content, stopSequences: stop)
} catch {
throw FoundationModelError.responseGenerationFailed(error.localizedDescription)
}
Expand All @@ -248,21 +248,21 @@ class FoundationModelService {
#endif
}

func generateStreamingResponseWithTiming(for messages: [Message], temperature: Double? = nil, randomness: String? = nil) async throws -> (content: String, promptTime: Double) {
func generateStreamingResponseWithTiming(for messages: [Message], temperature: Double? = nil, randomness: String? = nil, stop: [String]? = nil) async throws -> (content: String, promptTime: Double) {
#if canImport(FoundationModels) && !DISABLE_FOUNDATION_MODELS
guard let session = session else {
throw FoundationModelError.sessionCreationFailed
}

let prompt = formatMessagesAsPrompt(messages)

// Measure actual Foundation Model processing time
let promptStartTime = Date()
let options = try createGenerationOptions(temperature: temperature, randomness: randomness)
let response = try await session.respond(to: prompt, options: options)
let promptTime = Date().timeIntervalSince(promptStartTime)
let content = response.content

let content = applyStopSequences(to: response.content, stopSequences: stop)

// Handle empty or nil content
guard !content.isEmpty else {
Expand Down Expand Up @@ -602,7 +602,33 @@ class FoundationModelService {
}
}
#endif


private func applyStopSequences(to content: String, stopSequences: [String]?) -> String {
guard let stopSequences = stopSequences, !stopSequences.isEmpty else {
return content
}

var shortestStopIndex: String.Index? = nil
var foundStop = false
Comment on lines +611 to +612
nitpick: Using a boolean flag for foundStop is redundant since shortestStopIndex can serve the same purpose.

Consider removing the foundStop flag and relying on shortestStopIndex being nil to indicate whether a stop was found.


// Find the earliest occurrence of any stop sequence
for stopSeq in stopSequences {
if let range = content.range(of: stopSeq) {
foundStop = true
if shortestStopIndex == nil || range.lowerBound < shortestStopIndex! {
shortestStopIndex = range.lowerBound
}
}
}

// If a stop sequence was found, truncate the content
if foundStop, let stopIndex = shortestStopIndex {
return String(content[..<stopIndex])
}

return content
}

static func isAvailable() -> Bool {
#if canImport(FoundationModels) && !DISABLE_FOUNDATION_MODELS
return true
10 changes: 6 additions & 4 deletions Sources/MacLocalAPI/Server.swift
@@ -17,8 +17,9 @@ class Server {
private let temperature: Double?
private let randomness: String?
private let permissiveGuardrails: Bool

init(port: Int, hostname: String, verbose: Bool, streamingEnabled: Bool, instructions: String, adapter: String? = nil, temperature: Double? = nil, randomness: String? = nil, permissiveGuardrails: Bool = false) async throws {
private let stop: String?

init(port: Int, hostname: String, verbose: Bool, streamingEnabled: Bool, instructions: String, adapter: String? = nil, temperature: Double? = nil, randomness: String? = nil, permissiveGuardrails: Bool = false, stop: String? = nil) async throws {
self.port = port
self.hostname = hostname
self.verbose = verbose
Expand All @@ -28,7 +29,8 @@ class Server {
self.temperature = temperature
self.randomness = randomness
self.permissiveGuardrails = permissiveGuardrails

self.stop = stop

// Create environment without command line arguments to prevent Vapor from parsing them
var env = Environment(name: "development", arguments: ["afm"])
try LoggingSystem.bootstrap(from: &env)
@@ -72,7 +74,7 @@
)
}

let chatController = ChatCompletionsController(streamingEnabled: streamingEnabled, instructions: instructions, adapter: adapter, temperature: temperature, randomness: randomness, permissiveGuardrails: permissiveGuardrails)
let chatController = ChatCompletionsController(streamingEnabled: streamingEnabled, instructions: instructions, adapter: adapter, temperature: temperature, randomness: randomness, permissiveGuardrails: permissiveGuardrails, stop: stop)
try app.register(collection: chatController)
}

18 changes: 13 additions & 5 deletions Sources/MacLocalAPI/main.swift
@@ -43,10 +43,13 @@ struct ServeCommand: ParsableCommand {

@Option(name: [.short, .long], help: "Sampling mode: 'greedy', 'random', 'random:top-p=<0.0-1.0>', 'random:top-k=<int>', with optional ':seed=<int>'")
var randomness: String?

@Flag(name: [.customShort("P"), .long], help: "Permissive guardrails for unsafe or inappropriate responses")
var permissiveGuardrails: Bool = false

@Option(name: .long, help: "Stop sequences - comma-separated strings where generation should stop (e.g., '###,END')")
var stop: String?

func run() throws {
// Validate temperature parameter
if let temp = temperature {
Expand Down Expand Up @@ -80,7 +83,7 @@ struct ServeCommand: ParsableCommand {
// Start server in async context
_ = Task {
do {
let server = try await Server(port: port, hostname: hostname, verbose: verbose, streamingEnabled: !noStreaming, instructions: instructions, adapter: adapter, temperature: temperature, randomness: randomness, permissiveGuardrails: permissiveGuardrails)
let server = try await Server(port: port, hostname: hostname, verbose: verbose, streamingEnabled: !noStreaming, instructions: instructions, adapter: adapter, temperature: temperature, randomness: randomness, permissiveGuardrails: permissiveGuardrails, stop: stop)
globalServer = server
try await server.start()
} catch {
Expand Down Expand Up @@ -154,10 +157,13 @@ struct RootCommand: ParsableCommand {

@Option(name: [.short, .long], help: "Sampling mode: 'greedy', 'random', 'random:top-p=<0.0-1.0>', 'random:top-k=<int>', with optional ':seed=<int>'")
var randomness: String?

@Flag(name: [.customShort("P"), .long], help: "Permissive guardrails for unsafe or inappropriate responses")
var permissiveGuardrails: Bool = false

@Option(name: .long, help: "Stop sequences - comma-separated strings where generation should stop (e.g., '###,END')")
var stop: String?

func run() throws {
// Validate temperature parameter
if let temp = temperature {
Expand Down Expand Up @@ -198,6 +204,7 @@ struct RootCommand: ParsableCommand {
serveCommand.temperature = temperature
serveCommand.randomness = randomness
serveCommand.permissiveGuardrails = permissiveGuardrails
serveCommand.stop = stop
try serveCommand.run()
}
}
@@ -266,7 +273,7 @@ extension RootCommand {

private func runSinglePrompt(_ prompt: String, adapter: String?) throws {
DebugLogger.log("Starting single prompt mode with prompt: '\(prompt)'")
DebugLogger.log("Temperature: \(temperature?.description ?? "nil"), Randomness: \(randomness ?? "nil")")
DebugLogger.log("Temperature: \(temperature?.description ?? "nil"), Randomness: \(randomness ?? "nil"), Stop: \(stop ?? "nil")")

let group = DispatchGroup()
var result: Result<String, Error>?
@@ -280,7 +287,8 @@
DebugLogger.log("FoundationModelService initialized successfully")
let message = Message(role: "user", content: prompt)
DebugLogger.log("Generating response...")
let response = try await foundationService.generateResponse(for: [message], temperature: temperature, randomness: randomness)
let stopSequences = stop?.split(separator: ",").map { String($0.trimmingCharacters(in: .whitespaces)) }
let response = try await foundationService.generateResponse(for: [message], temperature: temperature, randomness: randomness, stop: stopSequences)
Comment on lines +290 to +291
suggestion: Consider extracting stop sequence parsing into a shared utility to avoid duplication.

Centralizing the stop sequence parsing will help maintain consistency and simplify future updates to the parsing logic.

Suggested implementation:

                    let stopSequences = StopSequenceParser.parse(stop)
                    let response = try await foundationService.generateResponse(for: [message], temperature: temperature, randomness: randomness, stop: stopSequences)

struct StopSequenceParser {
    static func parse(_ stop: String?) -> [String]? {
        guard let stop = stop else { return nil }
        return stop.split(separator: ",").map { String($0.trimmingCharacters(in: .whitespaces)) }
    }
}

DebugLogger.log("Response generated successfully")
result = .success(response)
} else {