forked from meta-introspector/nice-parser
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
mike dupont
committed
Dec 22, 2023
1 parent
17b7133
commit 53edbad
Showing
14 changed files
with
2,607 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,3 +2,4 @@ | |
(public_name gbnf_parser) | ||
(name main) | ||
(libraries gbnf_parser)) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,19 @@ | ||
open Gbnf_parser | ||
let () = print_endline "Hello, World!" | ||
|
||
let grammar_from_channel ic = | ||
let lexbuf = Lexing.from_channel ic in | ||
let buffer, lexer = MenhirLib.ErrorReports.wrap SentenceLexer.main in | ||
let dat = (SentenceParser.grammar lexer lexbuf) in | ||
(print_endline (Batteries.dump dat)) | ||
|
||
let grammar_from_file filename = | ||
let ic = open_in filename in | ||
let g = grammar_from_channel ic in | ||
let () = close_in ic in | ||
g | ||
|
||
let () = | ||
let filename = Sys.argv.(1) in | ||
let g = grammar_from_file filename in | ||
(print_endline (Batteries.dump g)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,8 @@ | ||
(env | ||
(dev | ||
(flags (:standard -warn-error -9)))) | ||
(dev | ||
(flags | ||
(:standard -warn-error -9)))) | ||
|
||
(library | ||
(name grammar)) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
the grammars are taken from | ||
https://github.com/ggerganov/llama.cpp | ||
|
||
# GBNF Guide | ||
|
||
GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/server`. | ||
|
||
## Background | ||
|
||
[Bakus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features. | ||
|
||
## Basics | ||
|
||
In GBNF, we define *production rules* that specify how a *non-terminal* (rule name) can be replaced with sequences of *terminals* (characters, specifically Unicode [code points](https://en.wikipedia.org/wiki/Code_point)) and other non-terminals. The basic format of a production rule is `nonterminal ::= sequence...`. | ||
|
||
## Example | ||
|
||
Before going deeper, let's look at some of the features demonstrated in `grammars/chess.gbnf`, a small chess notation grammar: | ||
``` | ||
# `root` specifies the pattern for the overall output | ||
root ::= ( | ||
# it must start with the characters "1. " followed by a sequence | ||
# of characters that match the `move` rule, followed by a space, followed | ||
# by another move, and then a newline | ||
"1. " move " " move "\n" | ||
# it's followed by one or more subsequent moves, numbered with one or two digits | ||
([1-9] [0-9]? ". " move " " move "\n")+ | ||
) | ||
# `move` is an abstract representation, which can be a pawn, nonpawn, or castle. | ||
# The `[+#]?` denotes the possibility of checking or mate signs after moves | ||
move ::= (pawn | nonpawn | castle) [+#]? | ||
pawn ::= ... | ||
nonpawn ::= ... | ||
castle ::= ... | ||
``` | ||
|
||
## Non-Terminals and Terminals | ||
|
||
Non-terminal symbols (rule names) stand for a pattern of terminals and other non-terminals. They are required to be a dashed lowercase word, like `move`, `castle`, or `check-mate`. | ||
|
||
Terminals are actual characters ([code points](https://en.wikipedia.org/wiki/Code_point)). They can be specified as a sequence like `"1"` or `"O-O"` or as ranges like `[1-9]` or `[NBKQR]`. | ||
|
||
## Characters and character ranges | ||
|
||
Terminals support the full range of Unicode. Unicode characters can be specified directly in the grammar, for example `hiragana ::= [ぁ-ゟ]`, or with escapes: 8-bit (`\xXX`), 16-bit (`\uXXXX`) or 32-bit (`\UXXXXXXXX`). | ||
|
||
Character ranges can be negated with `^`: | ||
``` | ||
single-line ::= [^\n]+ "\n"` | ||
``` | ||
|
||
## Sequences and Alternatives | ||
|
||
The order of symbols in a sequence matter. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc. | ||
|
||
Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`. | ||
|
||
Parentheses `()` can be used to group sequences, which allows for embedding alternatives in a larger rule or applying repetition and optional symbols (below) to a sequence. | ||
|
||
## Repetition and Optional Symbols | ||
|
||
- `*` after a symbol or sequence means that it can be repeated zero or more times. | ||
- `+` denotes that the symbol or sequence should appear one or more times. | ||
- `?` makes the preceding symbol or sequence optional. | ||
|
||
## Comments and newlines | ||
|
||
Comments can be specified with `#`: | ||
``` | ||
# defines optional whitespace | ||
ws ::= [ \t\n]+ | ||
``` | ||
|
||
Newlines are allowed between rules and between symbols or sequences nested inside parentheses. Additionally, a newline after an alternate marker `|` will continue the current rule, even outside of parentheses. | ||
|
||
## The root rule | ||
|
||
In a full grammar, the `root` rule always defines the starting point of the grammar. In other words, it specifies what the entire output must match. | ||
|
||
``` | ||
# a grammar for lists | ||
root ::= ("- " item)+ | ||
item ::= [^\n]+ "\n" | ||
``` | ||
|
||
## Next steps | ||
|
||
This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars. You can try them out with: | ||
``` | ||
./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt' | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
root ::= (expr "=" ws term "\n")+ | ||
expr ::= term ([-+*/] term)* | ||
term ::= ident | num | "(" ws expr ")" ws | ||
ident ::= [a-z] [a-z0-9_]* ws | ||
num ::= [0-9]+ ws | ||
ws ::= [ \t\n]* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
root ::= (declaration)* | ||
|
||
declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}" | ||
|
||
dataType ::= "int" ws | "float" ws | "char" ws | ||
identifier ::= [a-zA-Z_] [a-zA-Z_0-9]* | ||
|
||
parameter ::= dataType identifier | ||
|
||
statement ::= | ||
( dataType identifier ws "=" ws expression ";" ) | | ||
( identifier ws "=" ws expression ";" ) | | ||
( identifier ws "(" argList? ")" ";" ) | | ||
( "return" ws expression ";" ) | | ||
( "while" "(" condition ")" "{" statement* "}" ) | | ||
( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) | | ||
( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? ) | | ||
( singleLineComment ) | | ||
( multiLineComment ) | ||
|
||
forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression | ||
forUpdate ::= identifier ws "=" ws expression | ||
|
||
condition ::= expression relationOperator expression | ||
relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">") | ||
|
||
expression ::= term (("+" | "-") term)* | ||
term ::= factor(("*" | "/") factor)* | ||
|
||
factor ::= identifier | number | unaryTerm | funcCall | parenExpression | ||
unaryTerm ::= "-" factor | ||
funcCall ::= identifier "(" argList? ")" | ||
parenExpression ::= "(" ws expression ws ")" | ||
|
||
argList ::= expression ("," ws expression)* | ||
|
||
number ::= [0-9]+ | ||
|
||
singleLineComment ::= "//" [^\n]* "\n" | ||
multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/" | ||
|
||
ws ::= ([ \t\n]+) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# Specifies chess moves as a list in algebraic notation, using PGN conventions | ||
|
||
# Force first move to "1. ", then any 1-2 digit number after, relying on model to follow the pattern | ||
root ::= "1. " move " " move "\n" ([1-9] [0-9]? ". " move " " move "\n")+ | ||
move ::= (pawn | nonpawn | castle) [+#]? | ||
|
||
# piece type, optional file/rank, optional capture, dest file & rank | ||
nonpawn ::= [NBKQR] [a-h]? [1-8]? "x"? [a-h] [1-8] | ||
|
||
# optional file & capture, dest file & rank, optional promotion | ||
pawn ::= ([a-h] "x")? [a-h] [1-8] ("=" [NBKQR])? | ||
|
||
castle ::= "O-O" "-O"? |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
letter ::= "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" | "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" | ||
|
||
digit ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | ||
# removed " | "\f" | "\b" | ||
symbol ::= "[" | "]" | "{" | "}" | "(" | ")" | "<" | ">" | "'" | "=" | "|" | "." | "," | ";" | "-" | "+" | "*" | "?" | "\n" | "\t" | "\r" | ||
|
||
character ::= letter | digit | symbol | "_" | " " | ||
identifier ::= letter ( letter | digit | "_" ) | ||
|
||
#| "\f" | "\b" | ||
S ::= ( " " | "\n" | "\t" | "\r" ) | ||
|
||
terminal ::= "'" character "'" ( character "'" ) "'" | ||
|
||
terminator ::= (";" | ".") | ||
|
||
term ::= "(" S rhs S ")" | "[" S rhs S "]" | "{" S rhs S "}" | terminal | identifier | ||
|
||
factor ::= term S "?" | term S "*" | term S "+" | term S "-" S term | term S | ||
|
||
concatenation ::= ( S factor S "," ? ) + | ||
alternation ::= ( S concatenation S "|" ? ) + | ||
|
||
rhs ::= alternation | ||
lhs ::= identifier | ||
|
||
rule ::= lhs S "=" S rhs S terminator | ||
|
||
root ::= ( S rule S ) * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
comma ::= "," | ||
term ::= forall binders comma term | ||
| fun binders "=>" term | ||
| fix fix_bodies | ||
| cofix cofix_bodies | ||
| let ident "[" binders "]" "[:" term"]" ":=" term "in" term | ||
| let fix fix_body "in" term | ||
| let cofix cofix_body in term | ||
| let ( [name , … , name] ) [dep_ret_type] := term in term | ||
| let ' pattern [in term] := term [return_type] in term | ||
| if term [dep_ret_type] then term else term | ||
| term : term | ||
| term <: term | ||
| term :> | ||
| term -> term | ||
| term arg … arg | ||
| @ qualid [term … term] | ||
| term % ident | ||
| match match_item , … , match_item [return_type] with | ||
[[|] equation | … | equation] end | ||
| qualid | ||
| sort | ||
| num | ||
| _ | ||
| ( term ) | ||
arg ::= term | ||
| ( ident := term ) | ||
binders ::= binder … binder | ||
binder ::= name | ||
| ( name … name : term ) | ||
| ( name [: term] := term ) | ||
| ' pattern | ||
name ::= ident | _ | ||
qualid ::= ident | qualid access_ident | ||
sort ::= Prop | Set | Type | ||
fix_bodies ::= fix_body | ||
| fix_body with fix_body with … with fix_body for ident | ||
cofix_bodies ::= cofix_body | ||
| cofix_body with cofix_body with … with cofix_body for ident | ||
fix_body ::= ident binders [annotation] [: term] := term | ||
cofix_body ::= ident [binders] [: term] := term | ||
annotation ::= { struct ident } | ||
match_item ::= term [as name] [in qualid [pattern … pattern]] | ||
dep_ret_type ::= [as name] return_type | ||
return_type ::= return term | ||
equation ::= mult_pattern | … | mult_pattern => term | ||
mult_pattern ::= pattern , … , pattern | ||
pattern ::= qualid pattern … pattern | ||
| @ qualid pattern … pattern | ||
| pattern as ident | ||
| pattern % ident | ||
| qualid | ||
| _ | ||
| num | ||
| ( or_pattern , … , or_pattern ) | ||
or_pattern ::= pattern | … | pattern |
Oops, something went wrong.