refactor: 整理代码格式并修复部分类型和依赖问题

1. 调整task.py的TypeVar导入和默认值
2. 格式化多处列表和参数写法,统一括号风格
3. 为pdftool.py添加pyrefly忽略注释以抑制类型警告,并将get_text()的结果显式转换为str
4. 为emlmanager.py添加数据库连接断言和检查
5. 将hfdownload.py等文件中的depends_on、conditions参数由列表改为元组格式
This commit is contained in:
2026-06-26 21:52:44 +08:00
parent fd282db28f
commit 7ded8df05e
6 changed files with 112 additions and 98 deletions
+29 -9
View File
@@ -88,6 +88,8 @@ class EmailDatabase:
def insert_email(self, email_data: dict[str, Any]) -> bool:
"""插入邮件数据."""
assert self.conn, "数据库连接未初始化"
try:
with self._lock:
cursor = self.conn.cursor()
@@ -123,6 +125,8 @@ class EmailDatabase:
self, keyword: str = "", field: str = "all", limit: int = 100, offset: int = 0
) -> list[dict[str, Any]]:
"""搜索邮件."""
assert self.conn, "数据库连接未初始化"
with self._lock:
cursor = self.conn.cursor()
@@ -154,6 +158,8 @@ class EmailDatabase:
def get_grouped_emails(self) -> dict[str, list[dict[str, Any]]]:
"""获取按主题分组的邮件."""
assert self.conn, "数据库连接未初始化"
with self._lock:
cursor = self.conn.cursor()
cursor.execute(f"SELECT * FROM {TABLE_NAME} ORDER BY subject, date_parsed DESC")
@@ -183,6 +189,8 @@ class EmailDatabase:
def get_email_count(self) -> int:
"""获取邮件总数."""
assert self.conn, "数据库连接未初始化"
with self._lock:
cursor = self.conn.cursor()
cursor.execute(f"SELECT COUNT(*) FROM {TABLE_NAME}")
@@ -190,6 +198,8 @@ class EmailDatabase:
def clear_all(self) -> None:
"""清空所有邮件数据."""
assert self.conn, "数据库连接未初始化"
with self._lock:
cursor = self.conn.cursor()
cursor.execute(f"DELETE FROM {TABLE_NAME}")
@@ -557,15 +567,13 @@ class EmlManagerHandler(BaseHTTPRequestHandler):
emails = self.db.search_emails(keyword, field, limit, offset)
total_count = self.db.get_email_count()
self._send_json_response(
{
"emails": emails,
"count": len(emails),
"total": total_count,
"limit": limit,
"offset": offset,
}
)
self._send_json_response({
"emails": emails,
"count": len(emails),
"total": total_count,
"limit": limit,
"offset": offset,
})
def _api_get_email(self, query_params: dict[str, list[str]]) -> None:
"""API: 获取单个邮件详情."""
@@ -578,6 +586,10 @@ class EmlManagerHandler(BaseHTTPRequestHandler):
self._send_json_response({"error": "缺少邮件ID"}, 400)
return
if not self.db.conn:
self._send_json_response({"error": "数据库连接未初始化"}, 500)
return
with self.db._lock:
cursor = self.db.conn.cursor()
cursor.execute(f"SELECT * FROM {TABLE_NAME} WHERE id = ?", (int(email_id),))
@@ -630,6 +642,10 @@ class EmlManagerHandler(BaseHTTPRequestHandler):
if not eml_files:
return
if not self.db.conn:
self._send_json_response({"error": "数据库连接未初始化"}, 500)
return
# 先批量查询所有已存在的文件
with self.db._lock:
cursor = self.db.conn.cursor()
@@ -1268,6 +1284,10 @@ def main() -> None:
if eml_files:
print(f"发现 {len(eml_files)} 个 EML 文件,开始导入...")
if not EmlManagerHandler.db.conn:
print("数据库连接未初始化,无法导入邮件")
return
# 先批量查询所有已存在的文件
with EmlManagerHandler.db._lock:
cursor = EmlManagerHandler.db.conn.cursor()
+7 -7
View File
@@ -39,7 +39,7 @@ RUSTUP_MIRRORS: dict[str, dict[str, str]] = {
UsableRustVersion = Literal["stable", "nightly", "beta"]
UsableMirror = Literal["aliyun", "ustc", "tsinghua"]
DEFAULT_RUST_VERSION: str = "stable"
DEFAULT_RUST_VERSION: UsableRustVersion = "stable"
DEFAULT_MIRROR: UsableMirror = "tsinghua"
@@ -136,13 +136,13 @@ def main() -> None:
args = parser.parse_args()
if args.command == "mirror":
graph = px.Graph.from_specs(
[px.TaskSpec("set_rust_mirror", fn=set_rust_mirror, args=(args.name,), verbose=True)]
)
graph = px.Graph.from_specs([
px.TaskSpec("set_rust_mirror", fn=set_rust_mirror, args=(args.name,), verbose=True)
])
elif args.command == "install":
graph = px.Graph.from_specs(
[px.TaskSpec("install_rust", cmd=["rustup", "toolchain", "install", args.version], verbose=True)]
)
graph = px.Graph.from_specs([
px.TaskSpec("install_rust", cmd=["rustup", "toolchain", "install", args.version], verbose=True)
])
else:
parser.print_help()
return
+25 -35
View File
@@ -33,20 +33,16 @@ def init_sub_dirs() -> None:
sub_dirs = [subdir for subdir in Path.cwd().iterdir() if subdir.is_dir()]
for subdir in sub_dirs:
px.run(
px.Graph.from_specs(
[
px.TaskSpec(
"init",
cmd=["git", "init"],
conditions=[not_has_git_repo],
cwd=str(subdir),
),
px.TaskSpec("add", cmd=["git", "add", "."], depends_on=["init"], cwd=str(subdir)),
px.TaskSpec(
"commit", cmd=["git", "commit", "-m", "init commit"], depends_on=["add"], cwd=str(subdir)
),
]
),
px.Graph.from_specs([
px.TaskSpec(
"init",
cmd=["git", "init"],
conditions=(not_has_git_repo,),
cwd=subdir,
),
# Run "add" and "commit" in the same sub-directory as "init". Without an
# explicit cwd these tasks presumably fall back to the process working
# directory (the parent), so the new repository would never get its commit.
px.TaskSpec("add", cmd=["git", "add", "."], depends_on=("init",), cwd=subdir),
px.TaskSpec("commit", cmd=["git", "commit", "-m", "init commit"], depends_on=("add",), cwd=subdir),
]),
)
@@ -73,29 +69,23 @@ def main() -> None:
description="Gittool - Git 执行工具.",
graphs={
# 添加并提交
"a": px.Graph.from_specs(
[
px.TaskSpec("add", cmd=["git", "add", "."], conditions=[has_files]),
px.TaskSpec("commit", cmd=["git", "commit", "-m", "chore: update"], depends_on=["add"]),
]
),
"a": px.Graph.from_specs([
px.TaskSpec("add", cmd=["git", "add", "."], conditions=(has_files,)),
px.TaskSpec("commit", cmd=["git", "commit", "-m", "chore: update"], depends_on=("add",)),
]),
# 清理
"c": px.Graph.from_specs(
[
px.TaskSpec("clean", cmd=["git", "clean", "-xfd", *EXCLUDE_CMDS]),
px.TaskSpec("status", cmd=["git", "status", "--porcelain"], depends_on=["clean"]),
]
),
"c": px.Graph.from_specs([
px.TaskSpec("clean", cmd=["git", "clean", "-xfd", *EXCLUDE_CMDS]),
px.TaskSpec("status", cmd=["git", "status", "--porcelain"], depends_on=("clean",)),
]),
# 初始化、添加并提交
"i": px.Graph.from_specs(
[
px.TaskSpec("init", cmd=["git", "init"], conditions=[not_has_git_repo]),
px.TaskSpec("add", cmd=["git", "add", "."], depends_on=["init"], conditions=[has_files]),
px.TaskSpec(
"commit", cmd=["git", "commit", "-m", "init commit"], depends_on=["add"], conditions=[has_files]
),
]
),
"i": px.Graph.from_specs([
px.TaskSpec("init", cmd=["git", "init"], conditions=(not_has_git_repo,)),
px.TaskSpec("add", cmd=["git", "add", "."], depends_on=("init",), conditions=(has_files,)),
px.TaskSpec(
"commit", cmd=["git", "commit", "-m", "init commit"], depends_on=("add",), conditions=(has_files,)
),
]),
# 初始化子目录
"isub": px.Graph.from_specs([isub]),
# 推送
+40 -44
View File
@@ -37,50 +37,46 @@ def main():
download_dir.mkdir(parents=True, exist_ok=True)
if args.use_hfd:
graph = px.Graph.from_specs(
[
px.TaskSpec(name="setenvs", fn=setenvs, verbose=True),
px.TaskSpec(
name="download_hfd",
cmd=["wget", "https://hf-mirror.com/hfd/hfd.sh"],
depends_on=["setenvs"],
verbose=True,
),
px.TaskSpec(
name="chmod_hfd",
cmd=["chmod", "a+x", "hfd.sh"],
depends_on=["download_hfd"],
verbose=True,
),
px.TaskSpec(
name="run_hfd",
cmd=["./hfd.sh", dataset_name, args.type],
depends_on=["chmod_hfd"],
verbose=True,
),
]
)
graph = px.Graph.from_specs([
px.TaskSpec(name="setenvs", fn=setenvs, verbose=True),
px.TaskSpec(
name="download_hfd",
cmd=["wget", "https://hf-mirror.com/hfd/hfd.sh"],
depends_on=("setenvs",),
verbose=True,
),
px.TaskSpec(
name="chmod_hfd",
cmd=["chmod", "a+x", "hfd.sh"],
depends_on=("download_hfd",),
verbose=True,
),
px.TaskSpec(
name="run_hfd",
cmd=["./hfd.sh", dataset_name, args.type],
depends_on=("chmod_hfd",),
verbose=True,
),
])
else:
graph = px.Graph.from_specs(
[
px.TaskSpec(name="setenvs", fn=setenvs, verbose=True),
px.TaskSpec(
name="download",
cmd=[
"uvx",
"hf",
"download",
"--repo-type",
args.type,
"--force-download",
dataset_name,
"--local-dir",
str(Path.cwd() / dataset_name),
],
depends_on=["setenvs"],
verbose=True,
),
]
)
graph = px.Graph.from_specs([
px.TaskSpec(name="setenvs", fn=setenvs, verbose=True),
px.TaskSpec(
name="download",
cmd=[
"uvx",
"hf",
"download",
"--repo-type",
args.type,
"--force-download",
dataset_name,
"--local-dir",
str(Path.cwd() / dataset_name),
],
depends_on=("setenvs",),
verbose=True,
),
])
px.run(graph, strategy="thread", verbose=True)
+8 -1
View File
@@ -146,7 +146,7 @@ def pdf_extract_text(input_path: Path, output_path: Path) -> None:
doc = fitz.open(str(input_path))
text = ""
for page in doc:
text += page.get_text() + "\n\n"
text += str(page.get_text()) + "\n\n"
doc.close()
output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -164,6 +164,7 @@ def pdf_extract_images(input_path: Path, output_dir: Path) -> None:
output_dir.mkdir(parents=True, exist_ok=True)
image_count = 0
# pyrefly: ignore [bad-argument-type]
for page_num, page in enumerate(doc):
images = page.get_images(full=True)
for img_idx, img in enumerate(images):
@@ -249,9 +250,13 @@ def pdf_info(input_path: Path) -> None:
doc = fitz.open(str(input_path))
print(f"文件: {input_path}")
print(f"页数: {doc.page_count}")
# pyrefly: ignore [missing-attribute]
print(f"标题: {doc.metadata.get('title', 'N/A')}")
# pyrefly: ignore [missing-attribute]
print(f"作者: {doc.metadata.get('author', 'N/A')}")
# pyrefly: ignore [missing-attribute]
print(f"创建日期: {doc.metadata.get('creationDate', 'N/A')}")
# pyrefly: ignore [missing-attribute]
print(f"修改日期: {doc.metadata.get('modDate', 'N/A')}")
print(f"文件大小: {input_path.stat().st_size / 1024:.1f} KB")
doc.close()
@@ -281,6 +286,7 @@ def pdf_ocr(input_path: Path, output_path: Path, lang: str = "chi_sim+eng") -> N
new_page = new_doc.new_page(width=page.rect.width, height=page.rect.height)
new_page.insert_image(new_page.rect, pixmap=pix)
text_rect = fitz.Rect(0, 0, page.rect.width, page.rect.height)
# pyrefly: ignore [bad-argument-type]
new_page.insert_textbox(text_rect, ocr_text)
output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -319,6 +325,7 @@ def pdf_to_images(input_path: Path, output_dir: Path, dpi: int = 300) -> None:
doc = fitz.open(str(input_path))
output_dir.mkdir(parents=True, exist_ok=True)
# pyrefly: ignore [bad-argument-type]
for page_num, page in enumerate(doc):
pix = page.get_pixmap(dpi=dpi)
image_path = output_dir / f"{input_path.stem}_page_{page_num + 1}.png"
+3 -2
View File
@@ -28,12 +28,13 @@ from typing import (
Mapping,
Optional,
Tuple,
TypeVar,
Union,
cast,
)
T = TypeVar("T")
from typing_extensions import TypeVar
T = TypeVar("T", default=Any)
# 任务可调用对象可以是同步或异步的。显式保留联合类型,让 mypy 理解两种形态。
TaskFn = Union[