-
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* refactor * refactor * refactor * refactor * checkpoint * type cleanup * checkpoint * checkpoint * checkpoint * checkpoint * scrapy from master * fix: lib errors * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint * checkpoint
- Loading branch information
1 parent
40f4ab0
commit 2b06762
Showing
25 changed files
with
279 additions
and
216 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,64 +1,80 @@ | ||
# pyright: reportUnknownMemberType=false | ||
|
||
from scrapy.selector.unified import Selector | ||
from scrapy.http.response.xml import XmlResponse | ||
|
||
from typing import Any, Optional | ||
from typing import Any, Optional, cast, Protocol | ||
from toolz.functoolz import curry, flip, pipe # type: ignore | ||
|
||
from public_law.exceptions import ParseException | ||
from public_law.selector_util import xpath_get | ||
from public_law.text import NonemptyString, URL, titleize | ||
import public_law.text as text | ||
from public_law.items.crs import Article, Division, Title | ||
from public_law.parsers.usa.colorado.crs_articles import parse_articles | ||
from public_law.parsers.usa.colorado.crs_divisions import parse_divisions | ||
|
||
split = curry(flip(str.split)) | ||
xpath_get = curry(xpath_get) | ||
|
||
def second(x: list[Any]) -> Any: | ||
return x[1] | ||
|
||
class Logger(Protocol): | ||
def warn(self, message: str) -> None: ... | ||
|
||
|
||
def parse_title_bang(dom: XmlResponse, logger: Any) -> Title: | ||
|
||
def parse_title_bang(dom: XmlResponse, logger: Logger) -> Title: | ||
match parse_title(dom, logger): | ||
case None: | ||
raise Exception("Could not parse title") | ||
case title: | ||
return title | ||
|
||
|
||
def parse_title(dom: XmlResponse, logger: Any) -> Optional[Title]: | ||
match(dom.xpath("//TITLE-TEXT/text()").get()): | ||
case str(raw_name): | ||
name = NonemptyString(titleize(raw_name)) | ||
case None: | ||
logger.warn(f"Could not the parse title name in {dom.url}") | ||
return None | ||
def parse_title(dom: XmlResponse, logger: Logger) -> Optional[Title]: | ||
try: | ||
name = string_pipe( | ||
"//TITLE-TEXT/text()", | ||
xpath_get(dom), | ||
titleize | ||
) | ||
number = string_pipe( | ||
"//TITLE-NUM/text()", | ||
xpath_get(dom), | ||
text.split_on_space, | ||
second | ||
) | ||
children = _parse_divisions_or_articles(number, dom, logger) | ||
url = source_url(number) | ||
return Title(name, number, children, url) | ||
|
||
match(dom.xpath("//TITLE-NUM/text()").get()): | ||
case str(raw_number): | ||
number = NonemptyString(raw_number.split(" ")[1]) | ||
case None: | ||
logger.warn(f"Could not the parse title number in {dom.url}") | ||
return None | ||
except ParseException as e: | ||
logger.warn(f"Could not parse the title: {e}") | ||
return None | ||
|
||
match _parse_divisions_or_articles(number, dom, logger): | ||
case None: | ||
return None | ||
case children: | ||
url_number = number.rjust(2, "0") | ||
source_url = URL(f"https://leg.colorado.gov/sites/default/files/images/olls/crs2022-title-{url_number}.pdf") | ||
return Title( | ||
name = name, | ||
number = number, | ||
source_url = URL(source_url), | ||
children = children | ||
) | ||
|
||
|
||
def _parse_divisions_or_articles(title_number: NonemptyString, dom: Selector | XmlResponse, logger: Any) -> Optional[list[Division] | list[Article]]: | ||
|
||
def string_pipe(*args: Any) -> NonemptyString: | ||
"""A wrapper around pipe() that casts the result to a NonemptyString.""" | ||
args_with_string: Any = args + (NonemptyString,) | ||
|
||
return cast(NonemptyString, pipe(*args_with_string)) | ||
|
||
|
||
def _parse_divisions_or_articles(title_number: NonemptyString, dom: Selector | XmlResponse, logger: Logger) -> list[Division] | list[Article]: | ||
division_nodes = dom.xpath("//T-DIV") | ||
article_nodes = dom.xpath("//TA-LIST") | ||
|
||
if len(division_nodes) > 0: | ||
func = parse_divisions | ||
parse_fun = parse_divisions | ||
elif len(article_nodes) > 0: | ||
func = parse_articles | ||
parse_fun = parse_articles | ||
else: | ||
msg = f"Could not parse divisions or articles in Title {title_number}. Neither T-DIV nor TA-LIST nodes were found." | ||
logger.warn(msg) | ||
return None | ||
msg = f"Neither T-DIV nor TA-LIST nodes were found in Title {title_number}." | ||
raise ParseException(msg) | ||
|
||
return parse_fun(title_number, dom, logger) | ||
|
||
|
||
return func(title_number, dom, logger) | ||
def source_url(title_number: NonemptyString) -> URL: | ||
url_number = title_number.rjust(2, "0") | ||
return URL(f"https://leg.colorado.gov/sites/default/files/images/olls/crs2022-title-{url_number}.pdf") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.