From fe891e0af8e8c69f3a1d77724a1b87db19b07a96 Mon Sep 17 00:00:00 2001
From: Khaled Sulayman
Date: Wed, 11 Dec 2024 13:21:05 -0500
Subject: [PATCH] remove __name__ from logging.getLogger() calls to use root
 logger

Signed-off-by: Khaled Sulayman
---
 src/instructlab/sdg/blocks/block.py        |  2 +-
 src/instructlab/sdg/blocks/filterblock.py  |  2 +-
 src/instructlab/sdg/blocks/iterblock.py    |  2 +-
 src/instructlab/sdg/blocks/llmblock.py     |  2 +-
 src/instructlab/sdg/blocks/utilblocks.py   |  2 +-
 src/instructlab/sdg/checkpointing.py       |  2 +-
 src/instructlab/sdg/datamixing.py          |  7 ++-----
 src/instructlab/sdg/eval_data.py           |  2 +-
 src/instructlab/sdg/generate_data.py       |  5 ++---
 src/instructlab/sdg/pipeline.py            |  8 ++------
 src/instructlab/sdg/registry.py            |  2 +-
 src/instructlab/sdg/utils/chunkers.py      |  2 +-
 src/instructlab/sdg/utils/model_formats.py |  2 +-
 src/instructlab/sdg/utils/taxonomy.py      | 11 ++---------
 14 files changed, 18 insertions(+), 33 deletions(-)

diff --git a/src/instructlab/sdg/blocks/block.py b/src/instructlab/sdg/blocks/block.py
index bc955596..ba9ad45a 100644
--- a/src/instructlab/sdg/blocks/block.py
+++ b/src/instructlab/sdg/blocks/block.py
@@ -13,7 +13,7 @@
 # Local
 from ..registry import BlockRegistry
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger()
 
 
 # This is part of the public API.
diff --git a/src/instructlab/sdg/blocks/filterblock.py b/src/instructlab/sdg/blocks/filterblock.py
index 181c973e..5afa7102 100644
--- a/src/instructlab/sdg/blocks/filterblock.py
+++ b/src/instructlab/sdg/blocks/filterblock.py
@@ -11,7 +11,7 @@
 from ..registry import BlockRegistry
 from .block import Block
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger()
 
 
 # This is part of the public API.
diff --git a/src/instructlab/sdg/blocks/iterblock.py b/src/instructlab/sdg/blocks/iterblock.py
index b75b2ab4..1a6ac02d 100644
--- a/src/instructlab/sdg/blocks/iterblock.py
+++ b/src/instructlab/sdg/blocks/iterblock.py
@@ -11,7 +11,7 @@
 from ..registry import BlockRegistry
 from .block import Block
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger()
 
 
 # This is part of the public API.
diff --git a/src/instructlab/sdg/blocks/llmblock.py b/src/instructlab/sdg/blocks/llmblock.py
index 89d9a27e..534aa035 100644
--- a/src/instructlab/sdg/blocks/llmblock.py
+++ b/src/instructlab/sdg/blocks/llmblock.py
@@ -17,7 +17,7 @@
 from ..registry import BlockRegistry, PromptRegistry
 from .block import Block, BlockConfigParserError
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger()
 
 DEFAULT_MAX_NUM_TOKENS = 4096
 
diff --git a/src/instructlab/sdg/blocks/utilblocks.py b/src/instructlab/sdg/blocks/utilblocks.py
index a03e6cb7..46139b99 100644
--- a/src/instructlab/sdg/blocks/utilblocks.py
+++ b/src/instructlab/sdg/blocks/utilblocks.py
@@ -13,7 +13,7 @@
 from ..registry import BlockRegistry
 from .block import Block
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger()
 
 
 # This is part of the public API.
diff --git a/src/instructlab/sdg/checkpointing.py b/src/instructlab/sdg/checkpointing.py
index f04bc290..3dea7f8c 100644
--- a/src/instructlab/sdg/checkpointing.py
+++ b/src/instructlab/sdg/checkpointing.py
@@ -11,7 +11,7 @@
 # First Party
 from instructlab.sdg.utils import pandas
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger()
 
 
 class Checkpointer:
diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py
index 25da098c..2821b373 100644
--- a/src/instructlab/sdg/datamixing.py
+++ b/src/instructlab/sdg/datamixing.py
@@ -22,7 +22,7 @@
 # when |knowledge| << |skills|
 MIN_UPSAMPLE_THRESHOLD = 0.03
 ALLOWED_COLS = ["id", "messages", "metadata"]
-LOGGER = logging.getLogger(__name__)
+LOGGER = logging.getLogger()
 
 
 class DatasetListing(TypedDict):
@@ -739,10 +739,7 @@ def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
             self.num_procs,
         )
 
-    def generate(self, logger=None):
-        if logger is not None:
-            global LOGGER  # pylint: disable=global-statement
-            LOGGER = logger
+    def generate(self):
         self._gen_mixed_data(
             self.knowledge_recipe,
             self.output_file_knowledge_recipe,
diff --git a/src/instructlab/sdg/eval_data.py b/src/instructlab/sdg/eval_data.py
index 9d9ae8f8..c07ea39b 100644
--- a/src/instructlab/sdg/eval_data.py
+++ b/src/instructlab/sdg/eval_data.py
@@ -13,7 +13,7 @@
 # First Party
 from instructlab.sdg.pipeline import EVAL_PIPELINES_PKG, Pipeline
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger()
 
 
 def _extract_options(text: str) -> list[Any]:
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index ae5c6582..a640dbd3 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -340,7 +340,7 @@ def generate_data(
     document_output_dir = Path(output_dir) / f"documents-{date_suffix}"
 
     leaf_nodes = read_taxonomy_leaf_nodes(
-        taxonomy, taxonomy_base, yaml_rules, document_output_dir, logger=LOGGER
+        taxonomy, taxonomy_base, yaml_rules, document_output_dir
     )
     if not leaf_nodes:
         raise GenerateException("Error: No new leaf nodes found in the taxonomy.")
@@ -406,7 +406,6 @@ def generate_data(
             document_output_dir,
             model_name,
             docling_model_path=docling_model_path,
-            logger=LOGGER,
         )
 
         if not samples:
@@ -458,7 +457,7 @@ def generate_data(
         system_prompt,
     )
 
-    mixer.generate(logger=LOGGER)
+    mixer.generate()
 
     generate_duration = time.time() - generate_start
     LOGGER.info(f"Generation took {generate_duration:.2f}s")
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 9a6e497c..63866b2b 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -23,7 +23,7 @@
 from .blocks.block import Block
 from .registry import BlockRegistry
 
-LOGGER = logging.getLogger(__name__)
+LOGGER = logging.getLogger()
 
 
 # This is part of the public API.
@@ -134,16 +134,12 @@ def from_file(cls, ctx, pipeline_yaml):
             pipeline_yaml = os.path.join(resources.files(__package__), pipeline_yaml)
         return cls(ctx, pipeline_yaml, *_parse_pipeline_config_file(pipeline_yaml))
 
-    def generate(self, dataset, checkpoint_name=None, logger=None) -> Dataset:
+    def generate(self, dataset, checkpoint_name=None) -> Dataset:
         """
         Generate the dataset by running the pipeline steps.
         dataset: the input dataset
         checkpoint_name: unique subdir name for the checkpoint within checkpoint_dir
         """
-
-        if logger is not None:
-            global LOGGER  # pylint: disable=global-statement
-            LOGGER = logger
         # The checkpointer allows us to resume from where we left off
         # Saving the output of pipe instances along the way
         checkpoint_dir = None
diff --git a/src/instructlab/sdg/registry.py b/src/instructlab/sdg/registry.py
index e71e0172..e369e7a2 100644
--- a/src/instructlab/sdg/registry.py
+++ b/src/instructlab/sdg/registry.py
@@ -5,7 +5,7 @@
 # Third Party
 from jinja2 import Environment, StrictUndefined, Template
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger()
 
 
 class BlockRegistry:
diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index d8de36de..92a5937a 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -24,7 +24,7 @@
 # First Party
 from instructlab.sdg.utils.model_formats import is_model_gguf, is_model_safetensors
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger()
 
 _DEFAULT_CHUNK_OVERLAP = 100
 
diff --git a/src/instructlab/sdg/utils/model_formats.py b/src/instructlab/sdg/utils/model_formats.py
index 54272888..b85ebb08 100644
--- a/src/instructlab/sdg/utils/model_formats.py
+++ b/src/instructlab/sdg/utils/model_formats.py
@@ -7,7 +7,7 @@
 # Third Party
 from gguf.constants import GGUF_MAGIC
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger()
 
 
 def is_model_safetensors(model_path: pathlib.Path) -> bool:
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index 88c575e2..15e69587 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -30,7 +30,7 @@
 # Initialize the pdf parser
 PDFParser = pdf_parser_v1()
 
-LOGGER = logging.getLogger(__name__)
+LOGGER = logging.getLogger()
 
 
 def _is_taxonomy_file(fn: str) -> bool:
@@ -372,11 +372,8 @@ def read_taxonomy(
 
 
 def read_taxonomy_leaf_nodes(
-    taxonomy, taxonomy_base, yaml_rules, document_output_dir=None, logger=None
+    taxonomy, taxonomy_base, yaml_rules, document_output_dir=None
 ):
-    if logger is not None:
-        global LOGGER  # pylint: disable=global-statement
-        LOGGER = logger
     seed_instruction_data = read_taxonomy(
         taxonomy, taxonomy_base, yaml_rules, document_output_dir
     )
@@ -466,11 +463,7 @@ def leaf_node_to_samples(
     document_output_dir,
     model_name,
     docling_model_path=None,
-    logger=None,
 ):
-    if logger is not None:
-        global LOGGER  # pylint: disable=global-statement
-        LOGGER = logger
     if not leaf_node:
         return []
     if leaf_node[0].get("documents"):
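
A minimal sketch of what switching from logging.getLogger(__name__) to
logging.getLogger() means in practice, using only standard-library logging;
the logger name below is illustrative and not taken from the patch:

    import logging

    # Before: a named logger under the "instructlab.sdg" hierarchy, which an
    # application can configure independently of other libraries.
    named = logging.getLogger("instructlab.sdg.blocks.block")

    # After: logging.getLogger() with no argument returns the root logger, so
    # records from these modules are emitted by the root logger directly.
    root = logging.getLogger()

    logging.basicConfig(level=logging.WARNING)
    logging.getLogger("instructlab.sdg").setLevel(logging.DEBUG)

    named.debug("emitted: the instructlab.sdg level applies to named loggers")
    root.debug("suppressed: the root logger still uses the WARNING level")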