first commit

This commit is contained in:
闫旭隆
2025-09-25 10:33:37 +08:00
commit 34839c2654
387 changed files with 149159 additions and 0 deletions

View File

@ -0,0 +1,22 @@
from dataclasses import dataclass
@dataclass
class ProcessingConfig:
    """Configuration for text processing pipeline.

    The first three fields are required; every other field has a default.
    This class only stores values: no validation or normalisation happens
    here, so consumers must check ranges themselves.

    NOTE(review): the per-field notes below are inferred from the field
    names alone, because the code that reads this config is not visible in
    this file. Confirm each one against the consumer before relying on it.
    """

    # --- Required settings (no defaults) ---
    model_path: str  # presumably the path/identifier of the model to load
    data_directory: str  # presumably the directory holding the input data
    filename_pattern: str  # presumably selects which input files to process

    # --- Batching and output ---
    batch_size_triple: int = 16  # presumably batch size for the "triple" stage
    batch_size_concept: int = 64  # presumably batch size for the "concept" stage
    output_directory: str = "./generation_result_debug"

    # --- Sharding ---
    # presumably current_shard_* is a 0-based index below total_shards_*;
    # nothing here enforces that -- TODO confirm in the consumer.
    total_shards_triple: int = 1
    current_shard_triple: int = 0
    total_shards_concept: int = 1
    current_shard_concept: int = 0

    # --- Runtime switches ---
    use_8bit: bool = False  # presumably toggles 8-bit model loading -- confirm
    debug_mode: bool = False
    resume_from: int = 0  # presumably an offset to resume from; unit unknown
    record: bool = False
    max_new_tokens: int = 8192  # presumably a per-generation token cap
    max_workers: int = 8  # presumably the size of a worker pool
    remove_doc_spaces: bool = False