-
Notifications
You must be signed in to change notification settings - Fork 46
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix markdown reader image path #334
base: feature
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,15 +37,18 @@ def __init__( | |
f"PaiMarkdownReader created with enable_table_summary : {self.enable_table_summary}" | ||
) | ||
|
||
def replace_image_paths(self, markdown_name: str, content: str): | ||
def replace_image_paths(self, markdown_dir: str, markdown_name: str, content: str): | ||
markdown_image_matches = MARKDOWN_IMAGE_PATTERN.finditer(content) | ||
html_image_matches = HTML_IMAGE_PATTERN.finditer(content) | ||
for match in markdown_image_matches: | ||
full_match = match.group(0) # 整个匹配 | ||
local_url = match.group(1) # 捕获的URL | ||
image_name = os.path.basename(local_url) | ||
|
||
local_path = os.path.join(markdown_dir, image_name) | ||
|
||
if self._oss_cache: | ||
oss_url = self._transform_local_to_oss(markdown_name, local_url) | ||
oss_url = self._transform_local_to_oss(markdown_name, local_path) | ||
if oss_url: | ||
content = content.replace(local_url, oss_url) | ||
else: | ||
|
@@ -55,9 +58,12 @@ def replace_image_paths(self, markdown_name: str, content: str): | |
for match in html_image_matches: | ||
full_match = match.group(0) # 整个匹配 | ||
local_url = match.group(1) # 捕获的URL | ||
image_name = os.path.basename(local_url) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. image有可能会有上层目录,比如"figures/docs/1.jpg" There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 建议这样写
def is_url(url):
"""判断是否为 URL"""
try:
result = urlparse(url)
return all([result.scheme, result.netloc])
except ValueError:
return False
base_dir = os.path.dirname(markdown_path)
if not is_url(image_path):
    image_path = os.path.join(base_dir, image_path) #绝对路径不会被合并 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. image在上传的时候,上层目录没有被保留 |
||
|
||
local_path = os.path.join(markdown_dir, image_name) | ||
|
||
if self._oss_cache: | ||
oss_url = self._transform_local_to_oss(markdown_name, local_url) | ||
oss_url = self._transform_local_to_oss(markdown_name, local_path) | ||
if oss_url: | ||
content = content.replace(local_url, oss_url) | ||
else: | ||
|
@@ -78,6 +84,7 @@ def _transform_local_to_oss(self, markdown_name: str, local_url: str): | |
def parse_markdown(self, markdown_path): | ||
markdown_name = os.path.basename(markdown_path).split(".")[0] | ||
markdown_name = markdown_name.replace(" ", "_") | ||
markdown_dir = os.path.dirname(markdown_path) | ||
text = "" | ||
pre_line = "" | ||
with open(markdown_path) as fp: | ||
|
@@ -108,7 +115,7 @@ def parse_markdown(self, markdown_path): | |
line = fp.readline() | ||
|
||
text += pre_line | ||
md_content = self.replace_image_paths(markdown_name, text) | ||
md_content = self.replace_image_paths(markdown_dir, markdown_name, text) | ||
return md_content | ||
|
||
def load_data( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
突然想到,html文件里可能也要做这个处理