@@ -1955,8 +1955,8 @@ <h1 id="basic-classification">Basic Classification<a class="headerlink" href="#b
1955
1955
<p>When classifying documents, the process involves extracting the content of the document and adding it to the prompt with several possible classifications. ExtractThinker simplifies this process using Pydantic models and instructor.</p>
1956
1956
<h2 id="simple-classification">Simple Classification<a class="headerlink" href="#simple-classification" title="Permanent link">¶</a></h2>
1957
1957
<p>The most straightforward way to classify documents:</p>
1958
- < div class ="highlight "> < pre > < span > </ span > < code > < a id ="__codelineno-0-1 " name ="__codelineno-0-1 " href ="#__codelineno-0-1 "> </ a > < span class ="kn "> from</ span > < span class =" w " > </ span > < span class ="nn "> extract_thinker</ span > < span class =" w " > </ span > < span class ="kn "> import</ span > < span class ="n "> Classification</ span > < span class ="p "> ,</ span > < span class ="n "> Extractor</ span >
1959
- < a id ="__codelineno-0-2 " name ="__codelineno-0-2 " href ="#__codelineno-0-2 "> </ a > < span class ="kn "> from</ span > < span class =" w " > </ span > < span class ="nn "> extract_thinker.document_loader</ span > < span class =" w " > </ span > < span class ="kn "> import</ span > < span class ="n "> DocumentLoaderTesseract</ span >
1958
+ < div class ="highlight "> < pre > < span > </ span > < code > < a id ="__codelineno-0-1 " name ="__codelineno-0-1 " href ="#__codelineno-0-1 "> </ a > < span class ="kn "> from</ span > < span class ="nn "> extract_thinker</ span > < span class ="kn "> import</ span > < span class ="n "> Classification</ span > < span class ="p "> ,</ span > < span class ="n "> Extractor</ span >
1959
+ < a id ="__codelineno-0-2 " name ="__codelineno-0-2 " href ="#__codelineno-0-2 "> </ a > < span class ="kn "> from</ span > < span class ="nn "> extract_thinker.document_loader</ span > < span class ="kn "> import</ span > < span class ="n "> DocumentLoaderTesseract</ span >
1960
1960
< a id ="__codelineno-0-3 " name ="__codelineno-0-3 " href ="#__codelineno-0-3 "> </ a >
1961
1961
< a id ="__codelineno-0-4 " name ="__codelineno-0-4 " href ="#__codelineno-0-4 "> </ a > < span class ="c1 "> # Define classifications</ span >
1962
1962
< a id ="__codelineno-0-5 " name ="__codelineno-0-5 " href ="#__codelineno-0-5 "> </ a > < span class ="n "> classifications</ span > < span class ="o "> =</ span > < span class ="p "> [</ span >
@@ -1984,27 +1984,27 @@ <h2 id="simple-classification">Simple Classification<a class="headerlink" href="
1984
1984
</ code > </ pre > </ div >
1985
1985
<h2 id="type-mapping-with-contract">Type Mapping with Contract<a class="headerlink" href="#type-mapping-with-contract" title="Permanent link">¶</a></h2>
1986
1986
<p>Adding contract structure to the classification improves accuracy:</p>
1987
- < div class ="highlight "> < pre > < span > </ span > < code > < a id ="__codelineno-1-1 " name ="__codelineno-1-1 " href ="#__codelineno-1-1 "> </ a > < span class ="kn "> from</ span > < span class =" w " > </ span > < span class ="nn "> typing</ span > < span class =" w " > </ span > < span class ="kn "> import</ span > < span class ="n "> List</ span >
1988
- < a id ="__codelineno-1-2 " name ="__codelineno-1-2 " href ="#__codelineno-1-2 "> </ a > < span class ="kn "> from</ span > < span class =" w " > </ span > < span class ="nn "> extract_thinker.models.contract</ span > < span class =" w " > </ span > < span class ="kn "> import</ span > < span class ="n "> Contract</ span >
1987
+ < div class ="highlight "> < pre > < span > </ span > < code > < a id ="__codelineno-1-1 " name ="__codelineno-1-1 " href ="#__codelineno-1-1 "> </ a > < span class ="kn "> from</ span > < span class ="nn "> typing</ span > < span class ="kn "> import</ span > < span class ="n "> List</ span >
1988
+ < a id ="__codelineno-1-2 " name ="__codelineno-1-2 " href ="#__codelineno-1-2 "> </ a > < span class ="kn "> from</ span > < span class ="nn "> extract_thinker.models.contract</ span > < span class ="kn "> import</ span > < span class ="n "> Contract</ span >
1989
1989
< a id ="__codelineno-1-3 " name ="__codelineno-1-3 " href ="#__codelineno-1-3 "> </ a >
1990
- < a id ="__codelineno-1-4 " name ="__codelineno-1-4 " href ="#__codelineno-1-4 "> </ a > < span class ="k "> class</ span > < span class =" w " > </ span > < span class ="nc "> InvoiceContract</ span > < span class ="p "> (</ span > < span class ="n "> Contract</ span > < span class ="p "> ):</ span >
1990
+ < a id ="__codelineno-1-4 " name ="__codelineno-1-4 " href ="#__codelineno-1-4 "> </ a > < span class ="k "> class</ span > < span class ="nc "> InvoiceContract</ span > < span class ="p "> (</ span > < span class ="n "> Contract</ span > < span class ="p "> ):</ span >
1991
1991
< a id ="__codelineno-1-5 " name ="__codelineno-1-5 " href ="#__codelineno-1-5 "> </ a > < span class ="n "> invoice_number</ span > < span class ="p "> :</ span > < span class ="nb "> str</ span >
1992
1992
< a id ="__codelineno-1-6 " name ="__codelineno-1-6 " href ="#__codelineno-1-6 "> </ a > < span class ="n "> invoice_date</ span > < span class ="p "> :</ span > < span class ="nb "> str</ span >
1993
1993
< a id ="__codelineno-1-7 " name ="__codelineno-1-7 " href ="#__codelineno-1-7 "> </ a > < span class ="n "> lines</ span > < span class ="p "> :</ span > < span class ="n "> List</ span > < span class ="p "> [</ span > < span class ="n "> LineItem</ span > < span class ="p "> ]</ span >
1994
1994
< a id ="__codelineno-1-8 " name ="__codelineno-1-8 " href ="#__codelineno-1-8 "> </ a > < span class ="n "> total_amount</ span > < span class ="p "> :</ span > < span class ="nb "> float</ span >
1995
1995
< a id ="__codelineno-1-9 " name ="__codelineno-1-9 " href ="#__codelineno-1-9 "> </ a >
1996
- < a id ="__codelineno-1-10 " name ="__codelineno-1-10 " href ="#__codelineno-1-10 "> </ a > < span class ="k "> class</ span > < span class =" w " > </ span > < span class ="nc "> DriverLicense</ span > < span class ="p "> (</ span > < span class ="n "> Contract</ span > < span class ="p "> ):</ span >
1996
+ < a id ="__codelineno-1-10 " name ="__codelineno-1-10 " href ="#__codelineno-1-10 "> </ a > < span class ="k "> class</ span > < span class ="nc "> DriverLicense</ span > < span class ="p "> (</ span > < span class ="n "> Contract</ span > < span class ="p "> ):</ span >
1997
1997
< a id ="__codelineno-1-11 " name ="__codelineno-1-11 " href ="#__codelineno-1-11 "> </ a > < span class ="n "> name</ span > < span class ="p "> :</ span > < span class ="nb "> str</ span >
1998
1998
< a id ="__codelineno-1-12 " name ="__codelineno-1-12 " href ="#__codelineno-1-12 "> </ a > < span class ="n "> age</ span > < span class ="p "> :</ span > < span class ="nb "> int</ span >
1999
1999
< a id ="__codelineno-1-13 " name ="__codelineno-1-13 " href ="#__codelineno-1-13 "> </ a > < span class ="n "> license_number</ span > < span class ="p "> :</ span > < span class ="nb "> str</ span >
2000
2000
</ code > </ pre > </ div >
2001
2001
<p>The contract structure is automatically added to the prompt, helping the model understand the expected document structure.</p>
2002
2002
<h2 id="classification-response">Classification Response<a class="headerlink" href="#classification-response" title="Permanent link">¶</a></h2>
2003
2003
<p>All classifications return a standardized response:</p>
2004
- < div class ="highlight "> < pre > < span > </ span > < code > < a id ="__codelineno-2-1 " name ="__codelineno-2-1 " href ="#__codelineno-2-1 "> </ a > < span class ="kn "> from</ span > < span class =" w " > </ span > < span class ="nn "> typing</ span > < span class =" w " > </ span > < span class ="kn "> import</ span > < span class ="n "> Optional</ span >
2005
- < a id ="__codelineno-2-2 " name ="__codelineno-2-2 " href ="#__codelineno-2-2 "> </ a > < span class ="kn "> from</ span > < span class =" w " > </ span > < span class ="nn "> pydantic</ span > < span class =" w " > </ span > < span class ="kn "> import</ span > < span class ="n "> BaseModel</ span > < span class ="p "> ,</ span > < span class ="n "> Field</ span >
2004
+ < div class ="highlight "> < pre > < span > </ span > < code > < a id ="__codelineno-2-1 " name ="__codelineno-2-1 " href ="#__codelineno-2-1 "> </ a > < span class ="kn "> from</ span > < span class ="nn "> typing</ span > < span class ="kn "> import</ span > < span class ="n "> Optional</ span >
2005
+ < a id ="__codelineno-2-2 " name ="__codelineno-2-2 " href ="#__codelineno-2-2 "> </ a > < span class ="kn "> from</ span > < span class ="nn "> pydantic</ span > < span class ="kn "> import</ span > < span class ="n "> BaseModel</ span > < span class ="p "> ,</ span > < span class ="n "> Field</ span >
2006
2006
< a id ="__codelineno-2-3 " name ="__codelineno-2-3 " href ="#__codelineno-2-3 "> </ a >
2007
- < a id ="__codelineno-2-4 " name ="__codelineno-2-4 " href ="#__codelineno-2-4 "> </ a > < span class ="k "> class</ span > < span class =" w " > </ span > < span class ="nc "> ClassificationResponse</ span > < span class ="p "> (</ span > < span class ="n "> BaseModel</ span > < span class ="p "> ):</ span >
2007
+ < a id ="__codelineno-2-4 " name ="__codelineno-2-4 " href ="#__codelineno-2-4 "> </ a > < span class ="k "> class</ span > < span class ="nc "> ClassificationResponse</ span > < span class ="p "> (</ span > < span class ="n "> BaseModel</ span > < span class ="p "> ):</ span >
2008
2008
< a id ="__codelineno-2-5 " name ="__codelineno-2-5 " href ="#__codelineno-2-5 "> </ a > < span class ="n "> name</ span > < span class ="p "> :</ span > < span class ="nb "> str</ span >
2009
2009
< a id ="__codelineno-2-6 " name ="__codelineno-2-6 " href ="#__codelineno-2-6 "> </ a > < span class ="n "> confidence</ span > < span class ="p "> :</ span > < span class ="n "> Optional</ span > < span class ="p "> [</ span > < span class ="nb "> int</ span > < span class ="p "> ]</ span > < span class ="o "> =</ span > < span class ="n "> Field</ span > < span class ="p "> (</ span >
2010
2010
< a id ="__codelineno-2-7 " name ="__codelineno-2-7 " href ="#__codelineno-2-7 "> </ a > < span class ="n "> description</ span > < span class ="o "> =</ span > < span class ="s2 "> "From 1 to 10. 10 being the highest confidence"</ span > < span class ="p "> ,</ span >
0 commit comments