diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index dc3163e4..983a308b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -100,7 +100,7 @@ jobs: python -m pip install --upgrade pip python -m pip install tox tox-gh>=1.2 - - name: Run unit tests with tox + - name: Run unit and functional tests with tox run: | tox diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml index b0159b5f..c248cb45 100644 --- a/.markdownlint-cli2.yaml +++ b/.markdownlint-cli2.yaml @@ -13,3 +13,4 @@ ignores: - ".github/**" - "venv/**" - ".venv/**" + - "**/testdata/**" diff --git a/.spellcheck.yml b/.spellcheck.yml index 36dc7e40..f4edd131 100644 --- a/.spellcheck.yml +++ b/.spellcheck.yml @@ -8,7 +8,7 @@ matrix: camel-case: true mode: markdown sources: - - "**/*.md|!.tox/**|!venv/**" + - "**/*.md|!.tox/**|!venv/**|!**/testdata/**" dictionary: wordlists: - .spellcheck-en-custom.txt diff --git a/requirements.txt b/requirements.txt index 6222b1f8..7183da22 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 click>=8.1.7,<9.0.0 datasets>=2.18.0,<3.0.0 -docling>=2.3.0,<3.0.0 +docling>=2.4.2,<3.0.0 GitPython>=3.1.42,<4.0.0 httpx>=0.25.0,<1.0.0 instructlab-schema>=0.4.0 diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 59a9b570..8831bde6 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -95,6 +95,8 @@ def __new__( doc_dict = cls._split_docs_by_filetype(documents, filepaths) if len(doc_dict.keys()) > 1: raise ValueError("Received multiple document types") + if len(doc_dict.keys()) < 1: + raise ValueError("Received no document types") if FileTypes.MD in doc_dict: doc_contents = [d for d, _ in doc_dict[FileTypes.MD]] diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py new file mode 100644 index 00000000..4fa83859 --- /dev/null +++ b/tests/functional/test_chunkers.py @@ 
-0,0 +1,56 @@ +# Standard +from pathlib import Path +import os + +# First Party +from instructlab.sdg.utils.chunkers import DocumentChunker + +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata") + + +def test_chunk_pdf(tmp_path): + leaf_node = [ + { + "documents": ["Lorem ipsum"], + "filepaths": [Path(os.path.join(TEST_DATA_DIR, "phoenix.pdf"))], + "taxonomy_path": "knowledge", + } + ] + chunker = DocumentChunker( + leaf_node=leaf_node, + taxonomy_path=tmp_path, + output_dir=tmp_path, + server_ctx_size=4096, + chunk_word_count=500, + tokenizer_model_name="instructlab/merlinite-7b-lab", + ) + chunks = chunker.chunk_documents() + assert len(chunks) > 9 + assert "Phoenix is a minor constellation" in chunks[0] + for chunk in chunks: + # inexact sanity-checking of chunk max length + assert len(chunk) < 2500 + + +def test_chunk_md(tmp_path): + markdown_path = Path(os.path.join(TEST_DATA_DIR, "phoenix.md")) + leaf_node = [ + { + "documents": [markdown_path.read_text(encoding="utf-8")], + "filepaths": [markdown_path], + "taxonomy_path": "knowledge", + } + ] + chunker = DocumentChunker( + leaf_node=leaf_node, + taxonomy_path=tmp_path, + output_dir=tmp_path, + server_ctx_size=4096, + chunk_word_count=500, + tokenizer_model_name="instructlab/merlinite-7b-lab", + ) + chunks = chunker.chunk_documents() + assert len(chunks) > 7 + for chunk in chunks: + # inexact sanity-checking of chunk max length + assert len(chunk) < 2500 diff --git a/tests/functional/testdata/phoenix.md b/tests/functional/testdata/phoenix.md new file mode 100644 index 00000000..8ed90aea --- /dev/null +++ b/tests/functional/testdata/phoenix.md @@ -0,0 +1,284 @@ +# Phoenix (constellation) + +**Phoenix** is a minor [constellation](constellation "wikilink") in the +[southern sky](southern_sky "wikilink"). 
Named after the mythical +[phoenix](Phoenix_(mythology) "wikilink"), it was first depicted on a +celestial atlas by [Johann Bayer](Johann_Bayer "wikilink") in his 1603 +*[Uranometria](Uranometria "wikilink")*. The French explorer and +astronomer [Nicolas Louis de +Lacaille](Nicolas_Louis_de_Lacaille "wikilink") charted the brighter +stars and gave their [Bayer designations](Bayer_designation "wikilink") +in 1756. The constellation stretches from roughly −39 degrees to −57 degrees +[declination](declination "wikilink"), and from 23.5h to 2.5h of [right +ascension](right_ascension "wikilink"). The constellations Phoenix, +[Grus](Grus_(constellation) "wikilink"), +[Pavo](Pavo_(constellation) "wikilink") and [Tucana](Tucana "wikilink"), +are known as the Southern Birds. + +The brightest star, [Alpha Phoenicis](Alpha_Phoenicis "wikilink"), is +named Ankaa, an [Arabic](Arabic "wikilink") word meaning 'the Phoenix'. +It is an orange giant of apparent magnitude 2.4. Next is [Beta +Phoenicis](Beta_Phoenicis "wikilink"), actually a +[binary](Binary_star "wikilink") system composed of two yellow giants +with a combined apparent magnitude of 3.3. [Nu +Phoenicis](Nu_Phoenicis "wikilink") has a dust disk, while the +constellation has ten star systems with known planets and the recently +discovered [galaxy clusters](galaxy_cluster "wikilink") [El +Gordo](El_Gordo_(galaxy_cluster) "wikilink") and the [Phoenix +Cluster](Phoenix_Cluster "wikilink")—located 7.2 and 5.7 billion light +years away respectively, two of the largest objects in the [visible +universe](visible_universe "wikilink"). Phoenix is the +[radiant](radiant_(meteor_shower) "wikilink") of two annual [meteor +showers](meteor_shower "wikilink"): the +[Phoenicids](Phoenicids "wikilink") in December, and the July +Phoenicids. 
+ +## History + +Phoenix was the largest of the 12 constellations established by [Petrus +Plancius](Petrus_Plancius "wikilink") from the observations of [Pieter +Dirkszoon Keyser](Pieter_Dirkszoon_Keyser "wikilink") and [Frederick de +Houtman](Frederick_de_Houtman "wikilink"). It first appeared on a 35cm +diameter celestial globe published in 1597 (or 1598) in Amsterdam by +Plancius with [Jodocus Hondius](Jodocus_Hondius "wikilink"). The first +depiction of this constellation in a celestial atlas was in [Johann +Bayer](Johann_Bayer "wikilink")'s +*[Uranometria](Uranometria "wikilink")* of 1603. De Houtman included +it in his southern star catalog the same year under the Dutch name *Den +voghel Fenicx*, "The Bird Phoenix", symbolising the +[phoenix](Phoenix_(mythology) "wikilink") of classical mythology. One +name of the brightest star [Alpha +Phoenicis](Alpha_Phoenicis "wikilink")—Ankaa—is derived from the Arabic: +العنقاء, romanized: al-‘anqā’, lit. 'the phoenix', and +was coined sometime after 1800 in relation to the constellation. + +Celestial historian Richard Allen noted that unlike the other +constellations introduced by Plancius and [La +Caille](La_Caille "wikilink"), Phoenix has actual precedent in ancient +astronomy, as the Arabs saw this formation as representing young +ostriches, *Al Ri'āl*, or as a griffin or eagle. In addition, the +same group of stars was sometimes imagined by the Arabs as a boat, *Al +Zaurak*, on the nearby river Eridanus. He observed, "the introduction +of a Phoenix into modern astronomy was, in a measure, by adoption rather +than by invention." + +The Chinese incorporated Phoenix's brightest star, Ankaa (Alpha +Phoenicis), and stars from the adjacent constellation +[Sculptor](Sculptor_(constellation) "wikilink") to depict *Bakui*, a net +for catching birds. 
Phoenix and the neighbouring constellation of +[Grus](Grus_(constellation) "wikilink") together were seen by [Julius +Schiller](Julius_Schiller "wikilink") as portraying +[Aaron](Aaron "wikilink") the High Priest. These two constellations, +along with nearby [Pavo](Pavo_(constellation) "wikilink") and +[Tucana](Tucana "wikilink"), are called the Southern Birds. + +## Characteristics + +Phoenix is a small constellation bordered by [Fornax](Fornax "wikilink") +and Sculptor to the north, Grus to the west, Tucana to the south, +touching on the corner of [Hydrus](Hydrus "wikilink") to the south, and +[Eridanus](Eridanus_(constellation) "wikilink") to the east and +southeast. The bright star [Achernar](Achernar "wikilink") is +nearby. The three-letter abbreviation for the constellation, as +adopted by the [International Astronomical +Union](International_Astronomical_Union "wikilink") in 1922, is +"Phe". The official constellation boundaries, as set by Belgian +astronomer [Eugène Delporte](Eugène_Joseph_Delporte "wikilink") in 1930, +are defined by a polygon of 10 segments. In the [equatorial coordinate +system](equatorial_coordinate_system "wikilink"), the [right +ascension](right_ascension "wikilink") coordinates of these borders lie +between 23h 26.5m and 02h 25.0m, +while the [declination](declination "wikilink") +coordinates are between −39.31° and −57.84°. This means it remains +below the horizon to anyone living north of the [40th +parallel](40th_parallel_north "wikilink") in the [Northern +Hemisphere](Northern_Hemisphere "wikilink"), and remains low in the sky +for anyone living north of the [equator](equator "wikilink"). It is most +visible from locations such as Australia and South Africa during late +[Southern Hemisphere](Southern_Hemisphere "wikilink") spring. 
Most +of the constellation lies within, and can be located by, forming a +triangle of the bright stars Achernar, [Fomalhaut](Fomalhaut "wikilink") +and [Beta Ceti](Beta_Ceti "wikilink")—Ankaa lies roughly in the centre +of this. + +## Features + +### Stars + +A curved line of stars comprising Alpha, +[Kappa](Kappa_Phoenicis "wikilink"), [Mu](Mu_Phoenicis "wikilink"), +[Beta](Beta_Phoenicis "wikilink"), [Nu](Nu_Phoenicis "wikilink") and +[Gamma Phoenicis](Gamma_Phoenicis "wikilink") was seen as a boat by the +ancient Arabs. French explorer and astronomer [Nicolas Louis de +Lacaille](Nicolas_Louis_de_Lacaille "wikilink") charted and designated +27 stars with the [Bayer designations](Bayer_designation "wikilink") +Alpha through to Omega in 1756. Of these, he labelled two stars close +together Lambda, and assigned Omicron, Psi and Omega to three stars, +which subsequent astronomers such as [Benjamin +Gould](Benjamin_Apthorp_Gould "wikilink") felt were too dim to warrant +their letters. A different star was subsequently labelled Psi Phoenicis, +while the other two designations fell out of use. + +Ankaa is the brightest star in the constellation. It is an orange giant +of [apparent visual magnitude](apparent_visual_magnitude "wikilink") +2.37 and [spectral type](Stellar_classification "wikilink") +K0.5IIIb, 77 light years distant from Earth and orbited by a +secondary object about which little is known. Lying close by Ankaa +is [Kappa Phoenicis](Kappa_Phoenicis "wikilink"), a [main +sequence](main_sequence "wikilink") star of spectral type A5IVn and +apparent magnitude 3.90. Located centrally in the asterism, +[Beta Phoenicis](Beta_Phoenicis "wikilink") is the second brightest star +in the constellation and another [binary star](binary_star "wikilink"). +Together the stars, both yellow giants of spectral type G8, shine with +an apparent magnitude of 3.31, though the components are of individual +apparent magnitudes of 4.0 and 4.1 and orbit each other every 168 +years. 
[Zeta Phoenicis](Zeta_Phoenicis "wikilink") or *Wurren* +is an [Algol](Algol_variable "wikilink")-type [eclipsing +binary](Binary_star#Eclipsing_binaries "wikilink"), with an [apparent +magnitude](apparent_magnitude "wikilink") fluctuating between 3.9 and +4.4 with a period of around 1.7 days (40 hours); its dimming results +from the two component blue-white B-type stars, which orbit and block +out each other from Earth. The two stars are 0.05 AU from each other, +while a third star is around 600 AU away from the pair, and has an +orbital period exceeding 5000 years. The system is around 300 light +years distant. In 1976, researchers Clausen, Gyldenkerne, and +Grønbech calculated that a nearby 8th magnitude star is a fourth member +of the system. + +AI Phe is an eclipsing binary star identified in 1972. Its long mutual +eclipses and combination of spectroscopic and astrometric data allow +precise measurement of the masses and radii of the stars, which is +viewed as a potential cross-check on stellar properties and distances +independent of Cepheid variables and similar techniques. The long eclipse +events require space-based observations to avoid Solar interference. +Gamma Phoenicis is a [red giant](red_giant "wikilink") of spectral type +M0IIIa and varies between magnitudes 3.39 and 3.49. It lies 235 +light years away. [Psi Phoenicis](Psi_Phoenicis "wikilink") is +another red giant, this time of spectral type M4III, and has an +apparent magnitude that ranges between 4.3 and 4.5 over a period of +around 30 days. Lying 340 light years away, it has around 85 +times the diameter, but only 85% of the mass, of the Sun. [W +Phoenicis](W_Phoenicis "wikilink") is a [Mira +variable](Mira_variable "wikilink"), ranging from magnitude 8.1 to 14.4 +over 333.95 days. A red giant, its spectrum ranges between M5e and +M6e. 
Located 6.5 degrees west of Ankaa is [SX +Phoenicis](SX_Phoenicis "wikilink"), a variable star which ranges from +magnitude 7.1 to 7.5 over a period of a mere 79 minutes. Its spectral +type varies between A2 and F4. It gives its name to a group of stars +known as [SX Phoenicis variables](SX_Phoenicis_variable "wikilink"). +[Rho](Rho_Phoenicis "wikilink") and [BD +Phoenicis](BD_Phoenicis "wikilink") are [Delta Scuti +variables](Delta_Scuti_variable "wikilink")—short period (six hours at +most) pulsating stars that have been used as [standard +candles](Cosmic_distance_ladder#Standard_candles "wikilink") and as +subjects to study [astroseismology](astroseismology "wikilink"). Rho +is of spectral type F2III, and ranges between magnitudes 5.20 and 5.26 +over a period of 2.85 hours. BD is of spectral type A1V, and +ranges between magnitudes 5.90 and 5.94. + +[Nu Phoenicis](Nu_Phoenicis "wikilink") is a yellow-white main sequence +star of spectral type F9V and magnitude 4.96. Lying some 49 light +years distant, it is around 1.2 times as massive as the Sun, and +likely to be surrounded by a disk of dust. It is the closest star in +the constellation that is visible with the unaided eye. [Gliese +915](Gliese_915 "wikilink") is a [white dwarf](white_dwarf "wikilink") +only 26 light years away. It is of magnitude 13.05, too faint to be seen +with the naked eye. White dwarfs are extremely dense stars compacted +into a volume the size of the Earth. With around 85% of the mass of +the Sun, Gliese 915 has a [surface gravity](surface_gravity "wikilink") +of $10^{8.39 \pm 0.01}$ ($2.45 \cdot 10^{8}$) +[cm](centimetre "wikilink")·[s](second "wikilink")$^{-2}$, or +approximately 250,000 times [Earth's](Earth's_gravity "wikilink"). + +Ten stars have been found to have planets to date, and four planetary +systems have been discovered with the [SuperWASP](SuperWASP "wikilink") +project. 
[HD 142](HD_142 "wikilink") is a yellow giant that has an +apparent magnitude of 5.7, and has a planet ([HD 142 +b](HD_142_b "wikilink")) 1.36 times the mass of Jupiter which orbits +every 328 days. [HD 2039](HD_2039 "wikilink") is a yellow subgiant +with an apparent magnitude of 9.0 around 330 light years away which has +a planet ([HD 2039 b](HD_2039_b "wikilink")) six times the mass of +Jupiter. [WASP-18](WASP-18 "wikilink") is a star of magnitude 9.29 which +was discovered to have a hot Jupiter-like planet +([WASP-18b](WASP-18b "wikilink")) taking less than a day to orbit the +star. The planet is suspected to be causing WASP-18 to appear older +than it really is. [WASP-4](WASP-4 "wikilink") and +[WASP-5](WASP-5 "wikilink") are solar-type yellow stars around 1000 +light years distant and of 13th magnitude, each with a single planet +larger than Jupiter. [WASP-29](WASP-29 "wikilink") is an orange +dwarf of spectral type K4V and visual magnitude 11.3, which has a +planetary companion of similar size and mass to Saturn. The planet +completes an orbit every 3.9 days. + +[WISE J003231.09-494651.4](List_of_brown_dwarfs "wikilink") and [WISE +J001505.87-461517.6](List_of_brown_dwarfs "wikilink") are two [brown +dwarfs](brown_dwarf "wikilink") discovered by the [Wide-field Infrared +Survey Explorer](Wide-field_Infrared_Survey_Explorer "wikilink"), and +are 63 and 49 light years away respectively. Initially hypothesised +before they were belatedly discovered, brown dwarfs are objects more +massive than planets, but which are of insufficient mass for [hydrogen +fusion](Nuclear_fusion "wikilink") characteristic of stars to occur. +Many are being found by sky surveys. + +Phoenix contains [HE0107-5240](HE0107-5240 "wikilink"), possibly one of +the oldest stars yet discovered. It has around 1/200,000 the +[metallicity](metallicity "wikilink") that the Sun has and hence must +have formed very early in the history of the universe. 
With a visual +magnitude of 15.17, it is around 10,000 times dimmer than the +faintest stars visible to the naked eye and is 36,000 light years +distant. + +### Deep-sky objects + +The constellation does not lie on the [galactic +plane](galactic_plane "wikilink") of the Milky Way, and there are no +prominent star clusters. [NGC 625](NGC_625 "wikilink") is a dwarf +[irregular galaxy](irregular_galaxy "wikilink") of apparent magnitude +11.0 and lying some 12.7 million light years distant. Only 24000 light +years in diameter, it is an outlying member of the [Sculptor +Group](Sculptor_Group "wikilink"). NGC 625 is thought to have been +involved in a collision and is experiencing a burst of [active star +formation](Active_galactic_nucleus "wikilink"). [NGC +37](NGC_37 "wikilink") is a [lenticular +galaxy](lenticular_galaxy "wikilink") of apparent magnitude 14.66. It is +approximately 42 [kiloparsecs](kiloparsecs "wikilink") (137,000 +[light-years](light-years "wikilink")) in diameter and about 12.9 +billion years old. [Robert's Quartet](Robert's_Quartet "wikilink") +(composed of the irregular galaxy [NGC 87](NGC_87 "wikilink"), and three +spiral galaxies [NGC 88](NGC_88 "wikilink"), [NGC 89](NGC_89 "wikilink") +and [NGC 92](NGC_92 "wikilink")) is a group of four galaxies located +around 160 million light-years away which are in the process of +colliding and merging. They are within a circle of radius of 1.6 arcmin, +corresponding to about 75,000 light-years. Located in the galaxy ESO +243-49 is [HLX-1](HLX-1 "wikilink"), an [intermediate-mass black +hole](intermediate-mass_black_hole "wikilink")—the first one of its kind +identified. It is thought to be a remnant of a dwarf galaxy that was +absorbed in a [collision](Interacting_galaxy "wikilink") with ESO +243-49. Before its discovery, this class of black hole was only +hypothesized. 
+ +Lying within the bounds of the constellation is the gigantic [Phoenix +cluster](Phoenix_cluster "wikilink"), which is around 7.3 million light +years wide and 5.7 billion light years away, making it one of the most +massive [galaxy clusters](galaxy_cluster "wikilink"). It was first +discovered in 2010, and the central galaxy is producing an estimated 740 +new stars a year. Larger still is [El +Gordo](El_Gordo_(galaxy_cluster) "wikilink"), or officially ACT-CL +J0102-4915, whose discovery was announced in 2012. Located around +7.2 billion light years away, it is composed of two subclusters in the +process of colliding, resulting in the spewing out of hot gas, seen in +X-rays and infrared images. + +### Meteor showers + +Phoenix is the [radiant](radiant_(meteor_shower) "wikilink") of two +annual [meteor showers](meteor_shower "wikilink"). The +[Phoenicids](Phoenicids "wikilink"), also known as the December +Phoenicids, were first observed on 3 December 1887. The shower was +particularly intense in December 1956, and is thought to be related to the +breakup of the [short-period comet](short-period_comet "wikilink") +[289P/Blanpain](289P/Blanpain "wikilink"). It peaks around 4–5 December, +though it is not seen every year. A very minor meteor shower peaks +around July 14 with around one meteor an hour, though meteors can be +seen anytime from July 3 to 18; this shower is referred to as the July +Phoenicids. 
diff --git a/tests/functional/testdata/phoenix.pdf b/tests/functional/testdata/phoenix.pdf new file mode 100644 index 00000000..5430e78d Binary files /dev/null and b/tests/functional/testdata/phoenix.pdf differ diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py index 04970d24..0d327982 100644 --- a/tests/test_chunkers.py +++ b/tests/test_chunkers.py @@ -67,3 +67,22 @@ def test_chunker_factory_unsupported_filetype(documents_dir): output_dir=temp_dir, tokenizer_model_name="instructlab/merlinite-7b-lab", ) + + +def test_chunker_factory_empty_filetype(documents_dir): + """Test that the DocumentChunker factory class fails when provided no document""" + leaf_node = [ + { + "documents": [], + "taxonomy_path": "", + "filepaths": [], + } + ] + with pytest.raises(ValueError): + with tempfile.TemporaryDirectory() as temp_dir: + _ = DocumentChunker( + leaf_node=leaf_node, + taxonomy_path=documents_dir, + output_dir=temp_dir, + tokenizer_model_name="instructlab/merlinite-7b-lab", + ) diff --git a/tox.ini b/tox.ini index 1c6d2812..723212a9 100644 --- a/tox.ini +++ b/tox.ini @@ -3,11 +3,11 @@ [tox] # py3-unit runs unit tests with 'python3' # py311-unit runs the same tests with 'python3.11' -envlist = ruff, lint, mypy, spellcheck, py3-unit +envlist = ruff, lint, mypy, spellcheck, py3-{unit, functional} minversion = 4.4 [testenv] -description = run tests (unit, unitcov) +description = run tests (unit, unitcov, functional) # Use PyTorch CPU build instead of CUDA build in test envs. CUDA dependencies # are huge. This reduces venv from 5.7 GB to 1.5 GB. 
setenv = @@ -16,8 +16,16 @@ package = wheel wheel_build_env = pkg deps = -r requirements-dev.txt commands = - unit: {envpython} -m pytest {posargs:tests} - unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests -m "not (examples or slow)"} + unit: {envpython} -m pytest {posargs:tests --ignore=tests/functional} + unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests --ignore=tests/functional -m "not (examples or slow)"} + functional: {envpython} -m pytest {posargs:tests/functional} +allowlist_externals = + functional: ./scripts/functional-tests.sh + +[testenv:py3-functional] +setenv = + OPENAI_API_BASE={env:OPENAI_API_BASE:http://localhost:8000/v1} + OPENAI_API_KEY={env:OPENAI_API_KEY:EMPTY} # format, check, and linting targets don't build and install the project to # speed up testing. @@ -82,5 +90,5 @@ commands = [gh] python = - 3.11 = py311-unitcov - 3.10 = py310-unitcov + 3.11 = py311-{unitcov, functional} + 3.10 = py310-{unitcov, functional}