Add 100 Samples Per Regex / JSON Schema

dottxt-ai · Oct 11, 2024 · a199b7c · a199b7c
1 parent 717b3cc
commit a199b7c
Show file tree

Hide file tree

Showing 11 changed files with 60 additions and 40 deletions.
diff --git a/src/benchmark_lfe.py b/src/benchmark_lfe.py
@@ -26,14 +26,15 @@ def setup(self, model, _):
  self.tokenizer_data = build_token_enforcer_tokenizer_data(self.tokenizer)
 
  def time_lfe(self, _, regex):
- regex_string, regex_example = regex["regex"], regex["example"]
- regex_example_tokens = self.tokenizer.encode(regex_example)
+ regex_string, regex_samples = regex["regex"], regex["samples"]
 
  parser = RegexParser(regex_string)
  token_enforcer = TokenEnforcer(self.tokenizer_data, parser)
 
- for i in range(len(regex_example_tokens)):
- _ = token_enforcer.get_allowed_tokens(regex_example_tokens[: i + 1])
+ for regex_sample in regex_samples:
+ regex_sample_tokens = self.tokenizer.encode(regex_sample)
+ for i in range(len(regex_sample_tokens)):
+ _ = token_enforcer.get_allowed_tokens(regex_sample_tokens[: i + 1])
 
 
 class LMFormatEnforcerJsonSchema:
@@ -54,11 +55,12 @@ def setup(self, model, _):
  self.tokenizer_data = build_token_enforcer_tokenizer_data(self.tokenizer)
 
  def time_lfe(self, _, json):
- json_string, json_example = json["schema"], json["example"]
- json_example_tokens = self.tokenizer.encode(json_example)
+ json_string, json_samples = json["schema"], json["samples"]
 
  parser = JsonSchemaParser(json_string)
  token_enforcer = TokenEnforcer(self.tokenizer_data, parser)
 
- for i in range(len(json_example_tokens)):
- _ = token_enforcer.get_allowed_tokens(json_example_tokens[: i + 1])
+ for json_sample in json_samples:
+ json_sample_tokens = self.tokenizer.encode(json_sample)
+ for i in range(len(json_sample_tokens)):
+ _ = token_enforcer.get_allowed_tokens(json_sample_tokens[: i + 1])
diff --git a/src/benchmark_outlines.py b/src/benchmark_outlines.py
@@ -37,14 +37,15 @@ def time_outlines(self, _, regex):
  """
  caching.clear_cache()
 
- regex_string, regex_example = regex["regex"], regex["example"]
- regex_example_tokens = self.tokenizer.encode(regex_example)[0][0]
+ regex_string, regex_samples = regex["regex"], regex["samples"]
  guide = RegexGuide(regex_string, self.tokenizer)
 
- state = 0
- for token in regex_example_tokens:
- _ = guide.get_next_instruction(state)
- state = guide.get_next_state(state, token)
+ for regex_sample in regex_samples:
+ regex_sample_tokens = self.tokenizer.encode(regex_sample)[0][0]
+ state = guide.initial_state
+ for token in regex_sample_tokens:
+ _ = guide.get_next_instruction(state)
+ state = guide.get_next_state(state, token)
 
 
 class OutlinesJsonSchema:
@@ -72,13 +73,14 @@ def time_outlines(self, _, json_case):
  regular expression, and walking this index while generating tokens.
 
  """
- json_string, json_example = json_case["schema"], json_case["example"]
- json_example_tokens = self.tokenizer.encode(json_example)[0][0]
+ json_string, json_samples = json_case["schema"], json_case["samples"]
 
  regex_string = build_regex_from_schema(json.dumps(json_string))
  guide = RegexGuide(regex_string, self.tokenizer)
 
- state = 0
- for token in json_example_tokens:
- _ = guide.get_next_instruction(state)
- state = guide.get_next_state(state, token)
+ for json_sample in json_samples:
+ json_sample_tokens = self.tokenizer.encode(json_sample)[0][0]
+ state = guide.initial_state
+ for token in json_sample_tokens:
+ _ = guide.get_next_instruction(state)
+ state = guide.get_next_state(state, token)
diff --git a/src/benchmark_outlines_core.py b/src/benchmark_outlines_core.py
@@ -32,14 +32,15 @@ def time_outlines_core(self, _, regex):
  regular expression, and walking this index while generating tokens.
 
  """
- regex_string, regex_example = regex["regex"], regex["example"]
- regex_example_tokens = self.tokenizer.encode(regex_example)[0][0]
+ regex_string, regex_samples = regex["regex"], regex["samples"]
  guide = RegexGuide(regex_string, self.tokenizer)
 
- state = 0
- for token in regex_example_tokens:
- _ = guide.get_next_instruction(state)
- state = guide.get_next_state(state, token)
+ for regex_sample in regex_samples:
+ regex_sample_tokens = self.tokenizer.encode(regex_sample)[0][0]
+ state = guide.initial_state
+ for token in regex_sample_tokens:
+ _ = guide.get_next_instruction(state)
+ state = guide.get_next_state(state, token)
 
 
 class OutlinesCoreJsonSchema:
@@ -66,13 +67,14 @@ def time_outlines_core(self, _, json_case):
  regular expression, and walking this index while generating tokens.
 
  """
- json_string, json_example = json_case["schema"], json_case["example"]
- json_example_tokens = self.tokenizer.encode(json_example)[0][0]
+ json_string, json_samples = json_case["schema"], json_case["samples"]
 
  regex_string = build_regex_from_schema(json.dumps(json_string))
  guide = RegexGuide(regex_string, self.tokenizer)
 
- state = 0
- for token in json_example_tokens:
- _ = guide.get_next_instruction(state)
- state = guide.get_next_state(state, token)
+ for json_sample in json_samples:
+ json_sample_tokens = self.tokenizer.encode(json_sample)[0][0]
+ state = guide.initial_state
+ for token in json_sample_tokens:
+ _ = guide.get_next_instruction(state)
+ state = guide.get_next_state(state, token)
diff --git a/src/data.py b/src/data.py
@@ -1,38 +1,44 @@
+import json
+
+
 models = [
  "NousResearch/Nous-Hermes-llama-2-7b", # 32,000 tokens vocabulary
  "gpt2", # 50,257 tokens vocabulary
  "NousResearch/Hermes-3-Llama-3.1-8B", # 128,256 tokens vocabulary
  "unsloth/gemma-2-2b-it-bnb-4bit", # 256,128 tokens vocabulary
 ]
 
+
 regex_cases = [
  {
  "name": "Phone Number",
- "regex": r'\d{3}-\d{2}-\d{4}',
- "example": '203-22-1234'
+ "regex": r'\d{3}-\d{3}-\d{4}',
+ "samples": json.load(open("samples/phone_number.json")), # randomly generated
  },
  {
  "name": "URL",
  "regex": r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?',
- "example": 'https://github.com/outlines-dev/outlines'
+ "samples": json.load(open("samples/url.json")),
  },
  {
  "name": "GSM8K",
  "regex": r'A: [\w \.\*\-=\+,\?/]{10,50}\. The answer is [1-9][0-9]{0,9}\.',
- "example": 'A: Some thoughts before answering. The answer is 42.'
+ "samples": json.load(open("samples/gsm8k.json")),
  },
  {
  "name": "Complex string",
  "regex": r'(0|[1-9][0-9]*)|true|false|([a-zA-Z_][a-zA-Z_0-9]*)',
- "example": 'AVeryLongStringtoTest1234'
+ "samples": json.load(open("samples/complex_str.json")),
+
  },
  {
  "name": "Long integer",
  "regex": r'\+[1-9]\d{1,14}',
- "example": '1234567891234'
+ "samples": json.load(open("samples/long_integer.json")),
  }
 ]
 
+
 json_cases = [
  {
  "name": "RPG character",
@@ -55,7 +61,7 @@
  "title": "Character",
  "type": "object",
  },
- "example": """{'name': 'Super Warrior', 'age': 26, 'armor': 'leather', 'armor': 10}""",
+ "samples": [json.dumps(sample) for sample in json.load(open("samples/rpg_characters.json"))],
  },
  {
  "name": "Simple nested schema",
@@ -91,6 +97,6 @@
  },
  "required": ["id", "work", "recording_artists"],
  },
- "example": """{'id': 999, 'work': {'id': 1, 'name': 'Strasbourg Saint-Denis', 'composer': 'Roy Hargrove'}, 'recording_artists': [{'id': 2, 'name': 'Roy Hargrove', 'functions': ['Trumpet', 'Singing']}]}""",
+ "samples": [json.dumps(sample) for sample in json.load(open("samples/recording_schema.json"))],
  },
 ]
diff --git a/src/samples/complex_str.json b/src/samples/complex_str.json
@@ -0,0 +1 @@
+["falseJb", "false0IUnuntrueb_LgozC2VciR4TSU", "truefalsetruetrue", "60falsetrue0truefalsefalseucfalse", "GG6LOxmjtrue0", "ybZXifalsetrueIi3Ftrue", "falsetrueDlled4UiW0trueZJsNUjKfs", "4", "falseWkZpEfalse19falsefalsefalsetrue0", "true41falsectrue0falsetrue", "false11trues82", "true39falsetrue73true0falsetrue", "NDd9falsedjy3fGB", "true", "true58true0truefalse", "falset77_ZzU0sFE42etrue0D9", "52sYo7oF8YDPtrueu5x7eBuByBqZJNb00", "false22false8false", "falsefalsewkXfalseNl00true3", "FQJDLnTDVstku3J0X8d5RaerNJfaO96", "truefalsefalse0", "truehOfalse", "truefalseetCOz", "truefalsefalseOtruetrue0LHT0false", "0falsefalseWW3FVDzctruetruetrue", "false", "truetruefalsetruepZWNyfi12bkU", "truefalseigp1ub3", "truetrue64falsefalse", "SWAOdHtBfalse37UHNc1hlfAX_hEfalse7", "truetruefalseUMaFReibQfalse76_X8MWwTkRZfalse", "098truefalsetrue", "VNB0040rRrSOG048false", "xZRfalsetruetrue", "42", "falsetruetrueR", "Afalsetrue", "false0MUqmMTNtrueQLfwxtruetruetruefalse", "truearAcBkNR426yPWtruefalse37YgMuFwC2nfalse0", "uDvnVfalse2falsefalse24", "1iEwsRFaXzPj", "falsetruefalse40truefalsefalsetrueCVTyhXpeufalse", "iVdgt2_24trueO57", "un7HivkLu360falseXI9dfg0BEU53izLz11falsefalse", "Wfalse050false0", "grRmM2N7R0iQ0falsefalse0amOajE", "falsefalsefalsed70", "Fodkhdk1rXfalseV8fBRtrue034", "truek9gRy7Ll0qPx4gXTY_W", "truedI5xBI3cTi6", "falsetrueK541tVn1kofalsetrue", "QlWNmtruefalsevHmJX8i00falseap", "QuKVqKdmCfalsemx5RWQa", "R92truefalsetrue", "pRQW6krgFtrue0kqT96b2truelUarMp30v_68w66", "Nnfalse0", "false", "kVnGMeGfalse", "false", "533true096zyxAZXkVsV", "ZHGMEshg67", "27truefalsetrue", "0truetruetrueE1xfalse67", "false67DOVb2Ohcfalse", "false", "83truefalsefalsefalsefalse1843HAUZ", "falsetrueSgcSyFrMLtrue", "57truezn23BcwaTfalse", "BQsQY2W18false14", "false148trueIBjNKK7mWY", "036", "falsecZYAdPOjGkofalse2falsemN3ktruefalsetrue", "87", "true0falsefalse", "wtruetruetruetrue0true0", "truetruefalsetruefalsefalsetUxqN3BsGJ0", "trueryEUpMh0_UQjnA5AOhP6519", 
"falsefalsetrue0truetruefalsehLE2itrue57", "truefalsefalsefalse350", "zOyG0truefalsefalse", "falseYlQfg", "21Og", "0false00falsexA2mX7true", "96truetrue", "SFtg_HU_5GvFAkP0Pxw8K5ftruefalse", "ZtruGMWBuItruectW510falsetrue", "trueBvPmtrue00", "_fDdoTYwtrue", "295true97", "true", "false", "false0H0true000", "93ClaPkD41h7false76falsemtruePrRNiFsAOcY1YxC", "falsefalse170truerfmQImvL0", "false1", "dpvYu", "false0", "0true", "trueDpQs13MtrueEl9619true", "true46"]
diff --git a/src/samples/gsm8k.json b/src/samples/gsm8k.json
@@ -0,0 +1 @@
+["Thus, there were 36 - 12 - 9. The answer is 42.", "So she must be applying makeup to 12 tubes of lip gloss * 3 people per tube of lip gloss. The answer is 42.", "To fix the rest of the bike as well, she needs 20 + 5. The answer is 42.", "Thus, she had X. The answer is 42.", "During two days the boat can transport a total of 48 * 2. The answer is 42.", "In total, they have 140+180. The answer is 42.", "So he got 10*5. The answer is 42.", "So from Monday through Sunday, there are 20 + 25 + 22 + 20 + 10. The answer is 42.", "Henry spent a total of 35+6. The answer is 42.", "Then we will divide each side by 5 to figure out how many nuggets Alyssa ate, 100/5. The answer is 42.", "If Tina sold $210 worth of candy and Marvin sold $70 in candy then 210 - 70. The answer is 42.", "That means he had 15 - 9. The answer is 42.", "After adding the photos they took on the second day to the galley, the number of photos will be 600+320. The answer is 42.", "Adding those marbles to the 20 she had before, she ends up with 40 marbles + 20 marbles. The answer is 42.", "Therefore, in total Ed has 5+10. The answer is 42.", "There are 3000/3*4. The answer is 42.", "The electric bill needs an extra stamp, so she needs 20 + 1. The answer is 42.", "This leaves 24 clownfish \u2013 8 clownfish. The answer is 42.", "Then add the initial number of objects she could juggle to find the total: 10 objects + 3 objects. The answer is 42.", "Adding the cupcakes for the P.E. class 50, she needs to make a total of 90 cupcakes + 50 cupcakes. The answer is 42.", "The bike\u2019s cost must be subtracted from her earnings made in a month to get how much money is left over after purchasing the bike: $1120 - $400. The answer is 42.", "The total time it took to fill up the tank is 4+2. The answer is 42.", "The number of board members who attended the meeting was 80-32. The answer is 42.", "The grocery store offer is $0.50 a can and the warehouse is $0.25 a can so the grocery store is .50-.25. 
The answer is 42.", "The second ball is twice the size of the first ball, so she used 9 * 2. The answer is 42.", "If she is currently 20 inches tall, her height after 10 years will be 20+30. The answer is 42.", "John has 5 more than 20 roommates which is 20+5. The answer is 42.", "In five minutes, the three cats meow 5*11. The answer is 42.", "Thus, the principal needs to book 30/5. The answer is 42.", "He gave away 10 cones because 50 / 5. The answer is 42.", "He was left with 110-55. The answer is 42.", "In total, Mitchell read 420+80. The answer is 42.", "So he paid 20*5.5. The answer is 42.", "This means, 36 - 27. The answer is 42.", "Therefore, their change is 500 - 426. The answer is 42.", "Dividing both sides by 4 we get x. The answer is 42.", "Therefore, Loraine used 12 + 8. The answer is 42.", "The washing machine can only hold 14 towels and they have 84 towels to wash which means there are 84/14. The answer is 42.", "So he worked for 5*8. The answer is 42.", "The library needs 20/4. The answer is 42.", "Since there is a total of 100 chocolate bars and four types of chocolate, we divide 100/4. The answer is 42.", "60 minutes are in 1 hour and it takes her 120 minutes so that\u2019s 120/60. The answer is 42.", "The total liters of fuel are 150+100. The answer is 42.", "If the size of the drift on the second day, after half of the snow had melted, was 10 inches, then the depth of the original drift created on day 1 was 2*10. The answer is 42.", "In total Ed and Jacob have 2 shells + 13 shells + 15 shells. The answer is 42.", "Donna has 200-160. The answer is 42.", "Bianca, Celeste and McClain worked a total of 3240 minutes last weekend.. The answer is 42.", "After throwing some in the trash, he now holds a total of 234 cards - 34 trashed cards. The answer is 42.", "If she needs to eat 6 more cups in the next 2 days then she will have to eat 6 cups / 2 days. The answer is 42.", "Since he got 4 cents as change it cost 135-4. 
The answer is 42.", "So, he will be able to finish typing the document in 810 words / 45 words/minute. The answer is 42.", "To close on a distance of 200 feet, it would take the cheetah 210/30. The answer is 42.", "So each writer from 4th to 18th will earn $330 / 15. The answer is 42.", "So, they still need to collect 100 - 73. The answer is 42.", "Therefore, she has to buy 11 - 3. The answer is 42.", "Therefore, they have 20 + 40. The answer is 42.", "That leaves 180 - 20 - 60 - 12. The answer is 42.", "This means that Sandy will have 500000 - 125000. The answer is 42.", "Jung is 2 years + 24 years. The answer is 42.", "Angie had 18 left after paying taxes, so her share of the taxes was 38- 18. The answer is 42.", "The number of pies that are not eaten with pie is 2000-1360. The answer is 42.", "Since 1 slice of pepperoni fell off Jelly's slice, this means there are 10-1. The answer is 42.", "The percentage that are green is 60 because .6 x 100%. The answer is 42.", "That means it cost him $5000 - $4000. The answer is 42.", "Donna dropped off five more than that, so she dropped off 66 + 5. The answer is 42.", "Then add 8 years to the older sister's birth year to find the grandmother's birth year: 1936 + 8. The answer is 42.", "Kris and her brother therefore blew up a total of 60 balloons from Kris + 180 balloons from Kris\u2019 brother. The answer is 42.", "Now add the court costs, lawyer fees and fine: $240 + $300 + $280. The answer is 42.", "Then find the chances of getting ill after receiving the vaccine: 30% * 50%. The answer is 42.", "At a rate of $5 per dog, Harry ears $5 * 42. The answer is 42.", "Triple the number of rabbits in the cage is 3*20. The answer is 42.", "Then subtract that number from the 27 seagulls to find the final number of seagulls: 27 seagulls - 9 seagulls. The answer is 42.", "Thus, Anne is carrying a total of 2 + 4. The answer is 42.", "There are 15 cups of lemonade in the pitcher because 12 x 1 and 1/4. 
The answer is 42.", "Thus, Georgia has 75 - 25. The answer is 42.", "The town has therefore added 14 trash cans on the streets + 28 trash cans at the back of stores. The answer is 42.", "She has 0 minutes left when she is done because 60 - 60. The answer is 42.", "So a table and 4 chairs cost $140 + $80. The answer is 42.", "She would need to baby-sit for $75/$15. The answer is 42.", "The total amount for 6 months is $600 * 6. The answer is 42.", "If the total number of candles was 72, then the number of blue candles is 72 candles - 45 candles. The answer is 42.", "Therefore, one deck of basketball cards costs $50/2. The answer is 42.", "Together, the three friends have 1020+640. The answer is 42.", "The total shoe size is 14 + 7. The answer is 42.", "The combined total number of stripes Vaishali has on all of her hats is 12+12+0+10. The answer is 42.", "Therefore, there are 25 - 7. The answer is 42.", "Her assignment will take 10 minutes to complete, so she needs 42 minutes + 10 minutes. The answer is 42.", "In 6 years he will be 12 + 6. The answer is 42.", "So he won 5000+70+160-400. The answer is 42.", "There are 30+20+30. The answer is 42.", "Michelangelo will have 16-3. The answer is 42.", "He can therefore make 500 meters of silk / 5 meters of silk per dress. The answer is 42.", "To determine the halfway point, we simply divide the total number of miles by 2, performing 30/2. The answer is 42.", "Compared to Veronica's flashlight, Velma's flashlight could be seen for a distance of 13,000-1,000. The answer is 42.", "For 91 days the bus system carries 1,200,000 * 91. The answer is 42.", "All the apartments can therefore accommodate 66 apartments * 4 residents/apartment. The answer is 42.", "In total, the bus carried 160+150. The answer is 42.", "Quinton then gave away 6 - 1 -1 -1 -1. The answer is 42.", "They will play 16 / 2. The answer is 42.", "100 cents make a dollar so 200 cents make 200/100. The answer is 42."]
diff --git a/src/samples/long_integer.json b/src/samples/long_integer.json
@@ -0,0 +1 @@
+["+83094", "+1795091367092", "+42", "+929089011211089", "+26130955215791", "+7767", "+6759912978443", "+583634786789972", "+579578372355278", "+44823776970", "+6611156", "+279649300470", "+612623", "+94937014091346", "+33734690589606", "+5395287444852", "+157076793951", "+9563230", "+58630060740", "+4656577", "+134", "+37020491453584", "+170341651", "+67974", "+96524", "+4286169647603", "+391294983", "+856606", "+470498555", "+96362157", "+7250135946", "+117928236897", "+7959850718640", "+34227", "+619177965217", "+871596132581349", "+4636718456036", "+98798", "+6064130279", "+32438", "+53", "+5298655", "+213581994", "+71497", "+403383173625", "+87948252", "+82513793282189", "+6020207", "+347070153", "+56010933555", "+39", "+2740435060956", "+8283", "+41", "+8117493", "+565", "+17984995066427", "+538718279249", "+3668696927193", "+659772042831", "+6260995736", "+56963135", "+58801238412", "+26", "+93904", "+753963", "+807627773250", "+9803802", "+608439537", "+55850", "+539", "+12999556691", "+250951919292", "+6821851297836", "+3998189", "+8235", "+65369436", "+821565324", "+2273941809014", "+75", "+572269", "+2109495641802", "+78", "+99", "+894", "+255608621326220", "+653254404008", "+63292", "+9465729862115", "+85919674", "+745597", "+687839", "+77267545969564", "+6063933181660", "+99", "+41489130", "+850203071070", "+384911418317915", "+6591", "+78208"]