refactor: 整理代码格式并修复部分类型和依赖问题

1. 调整task.py的TypeVar导入和默认值
2. 格式化多处列表和参数写法，统一括号风格
3. 为pdftool.py添加pyrefly忽略注释修复类型警告
4. 为emlmanager.py添加数据库连接断言和检查
5. 修正hfdownload.py的depends_on参数为元组格式
This commit is contained in:
2026-06-26 21:52:44 +08:00
parent fd282db28f
commit 7ded8df05e
6 changed files with 112 additions and 98 deletions
+24 -4
View File
@@ -88,6 +88,8 @@ class EmailDatabase:
def insert_email(self, email_data: dict[str, Any]) -> bool: def insert_email(self, email_data: dict[str, Any]) -> bool:
"""插入邮件数据.""" """插入邮件数据."""
assert self.conn, "数据库连接未初始化"
try: try:
with self._lock: with self._lock:
cursor = self.conn.cursor() cursor = self.conn.cursor()
@@ -123,6 +125,8 @@ class EmailDatabase:
self, keyword: str = "", field: str = "all", limit: int = 100, offset: int = 0 self, keyword: str = "", field: str = "all", limit: int = 100, offset: int = 0
) -> list[dict[str, Any]]: ) -> list[dict[str, Any]]:
"""搜索邮件.""" """搜索邮件."""
assert self.conn, "数据库连接未初始化"
with self._lock: with self._lock:
cursor = self.conn.cursor() cursor = self.conn.cursor()
@@ -154,6 +158,8 @@ class EmailDatabase:
def get_grouped_emails(self) -> dict[str, list[dict[str, Any]]]: def get_grouped_emails(self) -> dict[str, list[dict[str, Any]]]:
"""获取按主题分组的邮件.""" """获取按主题分组的邮件."""
assert self.conn, "数据库连接未初始化"
with self._lock: with self._lock:
cursor = self.conn.cursor() cursor = self.conn.cursor()
cursor.execute(f"SELECT * FROM {TABLE_NAME} ORDER BY subject, date_parsed DESC") cursor.execute(f"SELECT * FROM {TABLE_NAME} ORDER BY subject, date_parsed DESC")
@@ -183,6 +189,8 @@ class EmailDatabase:
def get_email_count(self) -> int: def get_email_count(self) -> int:
"""获取邮件总数.""" """获取邮件总数."""
assert self.conn, "数据库连接未初始化"
with self._lock: with self._lock:
cursor = self.conn.cursor() cursor = self.conn.cursor()
cursor.execute(f"SELECT COUNT(*) FROM {TABLE_NAME}") cursor.execute(f"SELECT COUNT(*) FROM {TABLE_NAME}")
@@ -190,6 +198,8 @@ class EmailDatabase:
def clear_all(self) -> None: def clear_all(self) -> None:
"""清空所有邮件数据.""" """清空所有邮件数据."""
assert self.conn, "数据库连接未初始化"
with self._lock: with self._lock:
cursor = self.conn.cursor() cursor = self.conn.cursor()
cursor.execute(f"DELETE FROM {TABLE_NAME}") cursor.execute(f"DELETE FROM {TABLE_NAME}")
@@ -557,15 +567,13 @@ class EmlManagerHandler(BaseHTTPRequestHandler):
emails = self.db.search_emails(keyword, field, limit, offset) emails = self.db.search_emails(keyword, field, limit, offset)
total_count = self.db.get_email_count() total_count = self.db.get_email_count()
self._send_json_response( self._send_json_response({
{
"emails": emails, "emails": emails,
"count": len(emails), "count": len(emails),
"total": total_count, "total": total_count,
"limit": limit, "limit": limit,
"offset": offset, "offset": offset,
} })
)
def _api_get_email(self, query_params: dict[str, list[str]]) -> None: def _api_get_email(self, query_params: dict[str, list[str]]) -> None:
"""API: 获取单个邮件详情.""" """API: 获取单个邮件详情."""
@@ -578,6 +586,10 @@ class EmlManagerHandler(BaseHTTPRequestHandler):
self._send_json_response({"error": "缺少邮件ID"}, 400) self._send_json_response({"error": "缺少邮件ID"}, 400)
return return
if not self.db.conn:
self._send_json_response({"error": "数据库连接未初始化"}, 500)
return
with self.db._lock: with self.db._lock:
cursor = self.db.conn.cursor() cursor = self.db.conn.cursor()
cursor.execute(f"SELECT * FROM {TABLE_NAME} WHERE id = ?", (int(email_id),)) cursor.execute(f"SELECT * FROM {TABLE_NAME} WHERE id = ?", (int(email_id),))
@@ -630,6 +642,10 @@ class EmlManagerHandler(BaseHTTPRequestHandler):
if not eml_files: if not eml_files:
return return
if not self.db.conn:
self._send_json_response({"error": "数据库连接未初始化"}, 500)
return
# 先批量查询所有已存在的文件 # 先批量查询所有已存在的文件
with self.db._lock: with self.db._lock:
cursor = self.db.conn.cursor() cursor = self.db.conn.cursor()
@@ -1268,6 +1284,10 @@ def main() -> None:
if eml_files: if eml_files:
print(f"发现 {len(eml_files)} 个 EML 文件,开始导入...") print(f"发现 {len(eml_files)} 个 EML 文件,开始导入...")
if not EmlManagerHandler.db.conn:
print("数据库连接未初始化,无法导入邮件")
return
# 先批量查询所有已存在的文件 # 先批量查询所有已存在的文件
with EmlManagerHandler.db._lock: with EmlManagerHandler.db._lock:
cursor = EmlManagerHandler.db.conn.cursor() cursor = EmlManagerHandler.db.conn.cursor()
+7 -7
View File
@@ -39,7 +39,7 @@ RUSTUP_MIRRORS: dict[str, dict[str, str]] = {
UsableRustVersion = Literal["stable", "nightly", "beta"] UsableRustVersion = Literal["stable", "nightly", "beta"]
UsableMirror = Literal["aliyun", "ustc", "tsinghua"] UsableMirror = Literal["aliyun", "ustc", "tsinghua"]
DEFAULT_RUST_VERSION: str = "stable" DEFAULT_RUST_VERSION: UsableRustVersion = "stable"
DEFAULT_MIRROR: UsableMirror = "tsinghua" DEFAULT_MIRROR: UsableMirror = "tsinghua"
@@ -136,13 +136,13 @@ def main() -> None:
args = parser.parse_args() args = parser.parse_args()
if args.command == "mirror": if args.command == "mirror":
graph = px.Graph.from_specs( graph = px.Graph.from_specs([
[px.TaskSpec("set_rust_mirror", fn=set_rust_mirror, args=(args.name,), verbose=True)] px.TaskSpec("set_rust_mirror", fn=set_rust_mirror, args=(args.name,), verbose=True)
) ])
elif args.command == "install": elif args.command == "install":
graph = px.Graph.from_specs( graph = px.Graph.from_specs([
[px.TaskSpec("install_rust", cmd=["rustup", "toolchain", "install", args.version], verbose=True)] px.TaskSpec("install_rust", cmd=["rustup", "toolchain", "install", args.version], verbose=True)
) ])
else: else:
parser.print_help() parser.print_help()
return return
+18 -28
View File
@@ -33,20 +33,16 @@ def init_sub_dirs() -> None:
sub_dirs = [subdir for subdir in Path.cwd().iterdir() if subdir.is_dir()] sub_dirs = [subdir for subdir in Path.cwd().iterdir() if subdir.is_dir()]
for subdir in sub_dirs: for subdir in sub_dirs:
px.run( px.run(
px.Graph.from_specs( px.Graph.from_specs([
[
px.TaskSpec( px.TaskSpec(
"init", "init",
cmd=["git", "init"], cmd=["git", "init"],
conditions=[not_has_git_repo], conditions=(not_has_git_repo,),
cwd=str(subdir), cwd=subdir,
),
px.TaskSpec("add", cmd=["git", "add", "."], depends_on=["init"], cwd=str(subdir)),
px.TaskSpec(
"commit", cmd=["git", "commit", "-m", "init commit"], depends_on=["add"], cwd=str(subdir)
),
]
), ),
px.TaskSpec("add", cmd=["git", "add", "."], depends_on=("init",)),
px.TaskSpec("commit", cmd=["git", "commit", "-m", "init commit"], depends_on=("add",)),
]),
) )
@@ -73,29 +69,23 @@ def main() -> None:
description="Gittool - Git 执行工具.", description="Gittool - Git 执行工具.",
graphs={ graphs={
# 添加并提交 # 添加并提交
"a": px.Graph.from_specs( "a": px.Graph.from_specs([
[ px.TaskSpec("add", cmd=["git", "add", "."], conditions=(has_files,)),
px.TaskSpec("add", cmd=["git", "add", "."], conditions=[has_files]), px.TaskSpec("commit", cmd=["git", "commit", "-m", "chore: update"], depends_on=("add",)),
px.TaskSpec("commit", cmd=["git", "commit", "-m", "chore: update"], depends_on=["add"]), ]),
]
),
# 清理 # 清理
"c": px.Graph.from_specs( "c": px.Graph.from_specs([
[
px.TaskSpec("clean", cmd=["git", "clean", "-xfd", *EXCLUDE_CMDS]), px.TaskSpec("clean", cmd=["git", "clean", "-xfd", *EXCLUDE_CMDS]),
px.TaskSpec("status", cmd=["git", "status", "--porcelain"], depends_on=["clean"]), px.TaskSpec("status", cmd=["git", "status", "--porcelain"], depends_on=("clean",)),
] ]),
),
# 初始化、添加并提交 # 初始化、添加并提交
"i": px.Graph.from_specs( "i": px.Graph.from_specs([
[ px.TaskSpec("init", cmd=["git", "init"], conditions=(not_has_git_repo,)),
px.TaskSpec("init", cmd=["git", "init"], conditions=[not_has_git_repo]), px.TaskSpec("add", cmd=["git", "add", "."], depends_on=("init",), conditions=(has_files,)),
px.TaskSpec("add", cmd=["git", "add", "."], depends_on=["init"], conditions=[has_files]),
px.TaskSpec( px.TaskSpec(
"commit", cmd=["git", "commit", "-m", "init commit"], depends_on=["add"], conditions=[has_files] "commit", cmd=["git", "commit", "-m", "init commit"], depends_on=("add",), conditions=(has_files,)
),
]
), ),
]),
# 初始化子目录 # 初始化子目录
"isub": px.Graph.from_specs([isub]), "isub": px.Graph.from_specs([isub]),
# 推送 # 推送
+8 -12
View File
@@ -37,32 +37,29 @@ def main():
download_dir.mkdir(parents=True, exist_ok=True) download_dir.mkdir(parents=True, exist_ok=True)
if args.use_hfd: if args.use_hfd:
graph = px.Graph.from_specs( graph = px.Graph.from_specs([
[
px.TaskSpec(name="setenvs", fn=setenvs, verbose=True), px.TaskSpec(name="setenvs", fn=setenvs, verbose=True),
px.TaskSpec( px.TaskSpec(
name="download_hfd", name="download_hfd",
cmd=["wget", "https://hf-mirror.com/hfd/hfd.sh"], cmd=["wget", "https://hf-mirror.com/hfd/hfd.sh"],
depends_on=["setenvs"], depends_on=("setenvs",),
verbose=True, verbose=True,
), ),
px.TaskSpec( px.TaskSpec(
name="chmod_hfd", name="chmod_hfd",
cmd=["chmod", "a+x", "hfd.sh"], cmd=["chmod", "a+x", "hfd.sh"],
depends_on=["download_hfd"], depends_on=("download_hfd",),
verbose=True, verbose=True,
), ),
px.TaskSpec( px.TaskSpec(
name="run_hfd", name="run_hfd",
cmd=["./hfd.sh", dataset_name, args.type], cmd=["./hfd.sh", dataset_name, args.type],
depends_on=["chmod_hfd"], depends_on=("chmod_hfd",),
verbose=True, verbose=True,
), ),
] ])
)
else: else:
graph = px.Graph.from_specs( graph = px.Graph.from_specs([
[
px.TaskSpec(name="setenvs", fn=setenvs, verbose=True), px.TaskSpec(name="setenvs", fn=setenvs, verbose=True),
px.TaskSpec( px.TaskSpec(
name="download", name="download",
@@ -77,10 +74,9 @@ def main():
"--local-dir", "--local-dir",
str(Path.cwd() / dataset_name), str(Path.cwd() / dataset_name),
], ],
depends_on=["setenvs"], depends_on=("setenvs",),
verbose=True, verbose=True,
), ),
] ])
)
px.run(graph, strategy="thread", verbose=True) px.run(graph, strategy="thread", verbose=True)
+8 -1
View File
@@ -146,7 +146,7 @@ def pdf_extract_text(input_path: Path, output_path: Path) -> None:
doc = fitz.open(str(input_path)) doc = fitz.open(str(input_path))
text = "" text = ""
for page in doc: for page in doc:
text += page.get_text() + "\n\n" text += str(page.get_text()) + "\n\n"
doc.close() doc.close()
output_path.parent.mkdir(parents=True, exist_ok=True) output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -164,6 +164,7 @@ def pdf_extract_images(input_path: Path, output_dir: Path) -> None:
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
image_count = 0 image_count = 0
# pyrefly: ignore [bad-argument-type]
for page_num, page in enumerate(doc): for page_num, page in enumerate(doc):
images = page.get_images(full=True) images = page.get_images(full=True)
for img_idx, img in enumerate(images): for img_idx, img in enumerate(images):
@@ -249,9 +250,13 @@ def pdf_info(input_path: Path) -> None:
doc = fitz.open(str(input_path)) doc = fitz.open(str(input_path))
print(f"文件: {input_path}") print(f"文件: {input_path}")
print(f"页数: {doc.page_count}") print(f"页数: {doc.page_count}")
# pyrefly: ignore [missing-attribute]
print(f"标题: {doc.metadata.get('title', 'N/A')}") print(f"标题: {doc.metadata.get('title', 'N/A')}")
# pyrefly: ignore [missing-attribute]
print(f"作者: {doc.metadata.get('author', 'N/A')}") print(f"作者: {doc.metadata.get('author', 'N/A')}")
# pyrefly: ignore [missing-attribute]
print(f"创建日期: {doc.metadata.get('creationDate', 'N/A')}") print(f"创建日期: {doc.metadata.get('creationDate', 'N/A')}")
# pyrefly: ignore [missing-attribute]
print(f"修改日期: {doc.metadata.get('modDate', 'N/A')}") print(f"修改日期: {doc.metadata.get('modDate', 'N/A')}")
print(f"文件大小: {input_path.stat().st_size / 1024:.1f} KB") print(f"文件大小: {input_path.stat().st_size / 1024:.1f} KB")
doc.close() doc.close()
@@ -281,6 +286,7 @@ def pdf_ocr(input_path: Path, output_path: Path, lang: str = "chi_sim+eng") -> N
new_page = new_doc.new_page(width=page.rect.width, height=page.rect.height) new_page = new_doc.new_page(width=page.rect.width, height=page.rect.height)
new_page.insert_image(new_page.rect, pixmap=pix) new_page.insert_image(new_page.rect, pixmap=pix)
text_rect = fitz.Rect(0, 0, page.rect.width, page.rect.height) text_rect = fitz.Rect(0, 0, page.rect.width, page.rect.height)
# pyrefly: ignore [bad-argument-type]
new_page.insert_textbox(text_rect, ocr_text) new_page.insert_textbox(text_rect, ocr_text)
output_path.parent.mkdir(parents=True, exist_ok=True) output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -319,6 +325,7 @@ def pdf_to_images(input_path: Path, output_dir: Path, dpi: int = 300) -> None:
doc = fitz.open(str(input_path)) doc = fitz.open(str(input_path))
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
# pyrefly: ignore [bad-argument-type]
for page_num, page in enumerate(doc): for page_num, page in enumerate(doc):
pix = page.get_pixmap(dpi=dpi) pix = page.get_pixmap(dpi=dpi)
image_path = output_dir / f"{input_path.stem}_page_{page_num + 1}.png" image_path = output_dir / f"{input_path.stem}_page_{page_num + 1}.png"
+3 -2
View File
@@ -28,12 +28,13 @@ from typing import (
Mapping, Mapping,
Optional, Optional,
Tuple, Tuple,
TypeVar,
Union, Union,
cast, cast,
) )
T = TypeVar("T") from typing_extensions import TypeVar
T = TypeVar("T", default=Any)
# 任务可调用对象可以是同步或异步的。显式保留联合类型,让 mypy 理解两种形态。 # 任务可调用对象可以是同步或异步的。显式保留联合类型,让 mypy 理解两种形态。
TaskFn = Union[ TaskFn = Union[