Skip to content

Commit

Permalink
WIP: Use instructlab-schema package to parse qna.yaml files
Browse files Browse the repository at this point in the history
We remove code duplicated with instructlab to use the shared code
from instructlab-schema package.

See instructlab/schema#33

Signed-off-by: BJ Hargrave <[email protected]>
  • Loading branch information
bjhargrave committed Jul 1, 2024
1 parent 45ecc73 commit e475b9d
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 224 deletions.
3 changes: 2 additions & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,8 @@ int-import-graph=
known-standard-library=

# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
known-third-party=enchant,
instructlab.schema,

# Couples of modules and preferred modules, separated by a comma.
preferred-modules=
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
click>=8.1.7,<9.0.0
httpx>=0.25.0,<1.0.0
instructlab-schema @ git+https://github.com/instructlab/schema.git@refs/pull/33/head
jinja2
langchain-text-splitters
openai>=1.13.3,<2.0.0
Expand Down
248 changes: 25 additions & 223 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,30 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from functools import cache
from pathlib import Path
from typing import Any, Dict, List, Mapping, Optional, Union
from typing import Dict, List, Optional, Union
import glob
import json
import logging
import os
import re
import subprocess
import tempfile

# Third Party
from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
from instructlab.schema.taxonomy import (
TaxonomyMessageFormat,
TaxonomyParser,
TaxonomyReadingException,
)
import git
import gitdb
import yaml

# First Party
from instructlab.sdg import utils
from instructlab.sdg.utils import chunking
# Local
from . import GenerateException, chunking

logger = logging.getLogger(__name__)

DEFAULT_YAML_RULES = """\
extends: relaxed
rules:
line-length:
max: 120
"""


class TaxonomyReadingException(Exception):
"""An exception raised during reading of the taxonomy."""


TAXONOMY_FOLDERS: List[str] = ["compositional_skills", "knowledge"]
"""Taxonomy folders which are also the schema names"""


def _istaxonomyfile(fn):
path = Path(fn)
Expand Down Expand Up @@ -133,215 +119,32 @@ def _get_documents(
raise e


@cache
def _load_schema(path: "importlib.resources.abc.Traversable") -> "referencing.Resource":
"""Load the schema from the path into a Resource object.
Args:
path (Traversable): Path to the schema to be loaded.
Raises:
NoSuchResource: If the resource cannot be loaded.
Returns:
Resource: A Resource containing the requested schema.
"""
# pylint: disable=C0415
# Third Party
from referencing import Resource
from referencing.exceptions import NoSuchResource
from referencing.jsonschema import DRAFT202012

try:
contents = json.loads(path.read_text(encoding="utf-8"))
resource = Resource.from_contents(
contents=contents, default_specification=DRAFT202012
)
except Exception as e:
raise NoSuchResource(ref=str(path)) from e
return resource


def _validate_yaml(contents: Mapping[str, Any], taxonomy_path: Path) -> int:
"""Validate the parsed yaml document using the taxonomy path to
determine the proper schema.
Args:
contents (Mapping): The parsed yaml document to validate against the schema.
taxonomy_path (Path): Relative path of the taxonomy yaml document where the
first element is the schema to use.
Returns:
int: The number of errors found during validation.
Messages for each error have been logged.
"""
# pylint: disable=C0415
# Standard
from importlib import resources

# Third Party
from jsonschema.protocols import Validator
from jsonschema.validators import validator_for
from referencing import Registry, Resource
from referencing.exceptions import NoSuchResource
from referencing.typing import URI

errors = 0
version = _get_version(contents)
schemas_path = resources.files("instructlab.schema").joinpath(f"v{version}")

def retrieve(uri: URI) -> Resource:
path = schemas_path.joinpath(uri)
return _load_schema(path)

schema_name = taxonomy_path.parts[0]
if schema_name not in TAXONOMY_FOLDERS:
schema_name = "knowledge" if "document" in contents else "compositional_skills"
logger.info(
f"Cannot determine schema name from path {taxonomy_path}. Using {schema_name} schema."
)

try:
schema_resource = retrieve(f"{schema_name}.json")
schema = schema_resource.contents
validator_cls = validator_for(schema)
validator: Validator = validator_cls(
schema, registry=Registry(retrieve=retrieve)
)

for validation_error in validator.iter_errors(contents):
errors += 1
yaml_path = validation_error.json_path[1:]
if not yaml_path:
yaml_path = "."
if validation_error.validator == "minItems":
# Special handling for minItems which can have a long message for seed_examples
message = (
f"Value must have at least {validation_error.validator_value} items"
)
else:
message = validation_error.message[-200:]
logger.error(
f"Validation error in {taxonomy_path}: [{yaml_path}] {message}"
)
except NoSuchResource as e:
cause = e.__cause__ if e.__cause__ is not None else e
errors += 1
logger.error(f"Cannot load schema file {e.ref}. {cause}")

return errors


def _get_version(contents: Mapping) -> int:
version = contents.get("version", 1)
if not isinstance(version, int):
# schema validation will complain about the type
try:
version = int(version)
except ValueError:
version = 1 # fallback to version 1
return version


# pylint: disable=broad-exception-caught
def _read_taxonomy_file(file_path: str, yaml_rules: Optional[str] = None):
parser = TaxonomyParser(
schema_version=0, # Use version value in yaml
message_format=TaxonomyMessageFormat.LOGGING, # Report warnings and errors to the logger
yamllint_config=yaml_rules,
yamllint_strict=True, # Report yamllint warnings as errors
)
taxonomy = parser.parse(file_path)

if taxonomy.errors or taxonomy.warnings:
return None, taxonomy.warnings, taxonomy.errors

seed_instruction_data = []
warnings = 0
errors = 0
file_path = Path(file_path).resolve()
# file should end with ".yaml" explicitly
if file_path.suffix != ".yaml":
logger.warning(
f"Skipping {file_path}! Use lowercase '.yaml' extension instead."
)
warnings += 1
return None, warnings, errors
for i in range(len(file_path.parts) - 1, -1, -1):
if file_path.parts[i] in TAXONOMY_FOLDERS:
taxonomy_path = Path(*file_path.parts[i:])
break
else:
taxonomy_path = file_path
# read file if extension is correct
try:
with open(file_path, "r", encoding="utf-8") as file:
contents = yaml.safe_load(file)
if not contents:
logger.warning(f"Skipping {file_path} because it is empty!")
warnings += 1
return None, warnings, errors
if not isinstance(contents, Mapping):
logger.error(
f"{file_path} is not valid. The top-level element is not an object with key-value pairs."
)
errors += 1
return None, warnings, errors

# do general YAML linting if specified
version = _get_version(contents)
if version > 1: # no linting for version 1 yaml
if yaml_rules is not None:
is_file = os.path.isfile(yaml_rules)
if is_file:
logger.debug(f"Using YAML rules from {yaml_rules}")
yamllint_cmd = [
"yamllint",
"-f",
"parsable",
"-c",
yaml_rules,
file_path,
"-s",
]
else:
logger.debug(f"Cannot find {yaml_rules}. Using default rules.")
yamllint_cmd = [
"yamllint",
"-f",
"parsable",
"-d",
DEFAULT_YAML_RULES,
file_path,
"-s",
]
else:
yamllint_cmd = [
"yamllint",
"-f",
"parsable",
"-d",
DEFAULT_YAML_RULES,
file_path,
"-s",
]
try:
subprocess.check_output(yamllint_cmd, text=True)
except subprocess.SubprocessError as e:
lint_messages = [f"Problems found in file {file_path}"]
parsed_output = e.output.splitlines()
for p in parsed_output:
errors += 1
delim = str(file_path) + ":"
parsed_p = p.split(delim)[1]
lint_messages.append(parsed_p)
logger.error("\n".join(lint_messages))
return None, warnings, errors

validation_errors = _validate_yaml(contents, taxonomy_path)
if validation_errors:
errors += validation_errors
return None, warnings, errors

# get seed instruction data
tax_path = "->".join(taxonomy_path.parent.parts)
tax_path = "->".join(taxonomy.path.parent.parts)
contents = taxonomy.parsed
task_description = contents.get("task_description")
domain = contents.get("domain")
documents = contents.get("document")
if documents:
documents = _get_documents(source=documents)
logger.debug("Content from git repo fetched")

for seed_example in contents.get("seed_examples"):
for seed_example in contents.get("seed_examples", []):
question = seed_example.get("question")
answer = seed_example.get("answer")
context = seed_example.get("context", "")
Expand All @@ -357,10 +160,9 @@ def _read_taxonomy_file(file_path: str, yaml_rules: Optional[str] = None):
}
)
except Exception as e:
errors += 1
raise TaxonomyReadingException(f"Exception {e} raised in {file_path}") from e

return seed_instruction_data, warnings, errors
return seed_instruction_data, 0, 0


def read_taxonomy(taxonomy, taxonomy_base, yaml_rules):
Expand Down Expand Up @@ -440,7 +242,7 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count
samples[-1].setdefault("domain", domain)
samples[-1].setdefault("document", chunk)
if samples[-1].get("document") and not samples[-1].get("domain"):
raise utils.GenerateException(
raise GenerateException(
"Error: No domain provided for knowledge document in leaf node"
)
if "icl_query_3" in samples[-1]:
Expand Down

0 comments on commit e475b9d

Please sign in to comment.