ai-train_platform/download_train.py

# import asyncio
# import os.path
# import shutil
# import sys
# import threading
# import time
# from pathlib import Path
#
# from hachoir.parser.image.iptc import datasets
#
# from middleware.minio_util import downFullPathFile
# from middleware.query_model import ModelConfigDAO
# import yaml
# import multiprocessing
# import torch
# from ultralytics import YOLO
#
#
# async def download_train(task_id: str, bz_training_task_id: int, pt_name: str):
#
#     DB_CONFIG = {
#         "dbname": "smart_dev_123",
#         "user": "postgres",
#         "password": "root",
#         "host": "8.137.54.85",
#         "port": "5060"
#     }
#
#     # 创建DAO实例
#     dao = ModelConfigDAO(DB_CONFIG)
#     time_ns=time.time_ns()
#     output_root=f"dataset-{time_ns}"
#     if not os.path.exists(output_root):
#         os.mkdir(output_root)
#     list_labels = dao.get_labels(bz_training_task_id)
#     list_datasets = dao.get_datasets(bz_training_task_id)
#     label_yaml_list = dao.get_label_yaml(bz_training_task_id)
#
#         # 定义数据结构（字典）
#     uavid_config = {
#
#         "path": "",  # 替换为你的绝对路径
#         "train": "images/train",  # 训练集路径
#         "val": "images/val",  # 验证集路径
#         "test": "images/test",  # 测试集路径（可选）
#         "names": {}
#     }
#     uavid_config["path"]=os.path.abspath(output_root)
#     for i,item in enumerate(label_yaml_list):
#         item.id_order=i
#         uavid_config["names"][f"{i}"]=item.e_name
#     # 生成 YAML 文件
#     data_yaml="data.yaml"
#     with open(data_yaml, "w", encoding="utf-8") as f:
#         yaml.dump(
#             uavid_config,
#             f,
#             default_flow_style=False,  # 禁用紧凑格式（保持多行）
#             allow_unicode=True,  # 允许 Unicode 字符
#             sort_keys=False  # 保持键的顺序
#         )
#     file_name = os.path.basename(data_yaml)
#     des_path = os.path.join(output_root, file_name)
#     if os.path.exists(des_path):
#         os.remove(des_path)
#     shutil.move(data_yaml, output_root)
#     print(f"output_rootoutput_rootoutput_rootoutput_root  {os.path.abspath(output_root)}")
#
#
#     print("YAML 文件已生成：uavid_config.yaml")
#
#     invalid_indices = []
#
#     for index, pic in enumerate(list_datasets):
#         if pic.resource_original_path:  # 图像路径有效
#             download_path = downFullPathFile(pic.resource_original_path)
#             if download_path:  # 下载成功
#                 pic.local_path = download_path
#
#                 pic.label_name = Path(download_path).stem  # 截取图片名称，用作标签
#             else:
#                 invalid_indices.append(index) #存储不符合条件的索引，准备删除
#         else:
#             invalid_indices.append(index)#存储不符合条件的索引，准备删除
#
#
#     # 从后往前删除（避免删除时索引错乱）,删除不符合条件的list_datasets
#
#     for idx in sorted(invalid_indices, reverse=True):
#         del list_datasets[idx]
#
#     for data_pic in list_datasets: #整理完整的图像与标签集的对应关系
#         for label in list_labels:
#             if data_pic.id == label.id:
#                 for item in label_yaml_list:
#                     if label.label_ids==item.id:
#                         data_pic.label_content=data_pic.label_content+item.id_order+" "+label.annotation_data+ '\n'
#
#
#     for data_pic in list_datasets:
#         label_txt = f"{data_pic.label_name}.txt"
#         with open(label_txt, 'w', encoding='utf-8') as f:
#             f.write(data_pic.label_content)
#         data_pic.label_txt_path=os.path.abspath(label_txt)
#     # 移动文件，制作数据集
#
#
#     dataset_dirs = {
#         "images": Path(output_root) / "images",
#         "labels": Path(output_root) / "labels"
#     }
#     for ds_dir in dataset_dirs.values():
#         (ds_dir / "val").mkdir(parents=True, exist_ok=True)
#         (ds_dir / "train").mkdir(parents=True, exist_ok=True)
#         (ds_dir / "test").mkdir(parents=True, exist_ok=True)
#
#     count_pic=0
#     for data_pic in list_datasets:
#         count_pic=count_pic+1
#         if count_pic%10<8:
#             images_train_path=dataset_dirs["images"]
#             image_dir=os.path.join(images_train_path,"train")
#             file_name=os.path.basename(data_pic.local_path)
#             des_path=os.path.join(image_dir,file_name)
#             if os.path.exists(des_path):
#                 os.remove(des_path)
#             shutil.move(data_pic.local_path, image_dir)
#
#
#             labels_train_path=dataset_dirs["labels"]
#             label_dir=os.path.join(labels_train_path,"train")
#
#             file_name=os.path.basename(data_pic.label_txt_path)
#             des_path=os.path.join(label_dir,file_name)
#             if os.path.exists(des_path):
#                 os.remove(des_path)
#             shutil.move(data_pic.label_txt_path, label_dir)
#         if count_pic%10==8:
#             images_val_path=dataset_dirs["images"]
#             image_dir=os.path.join(images_val_path,"val")
#             file_name=os.path.basename(data_pic.local_path)
#             des_path=os.path.join(image_dir,file_name)
#             if os.path.exists(des_path):
#                 os.remove(des_path)
#             shutil.move(data_pic.local_path, image_dir)
#
#
#             labels_val_path=dataset_dirs["labels"]
#             label_dir=os.path.join(labels_val_path,"val")
#             file_name=os.path.basename(data_pic.label_txt_path)
#             des_path=os.path.join(label_dir,file_name)
#             if os.path.exists(des_path):
#                 os.remove(des_path)
#             shutil.move(data_pic.label_txt_path, label_dir)
#
#         if count_pic%10==9:
#             images_test_path=dataset_dirs["images"]
#             image_dir=os.path.join(images_test_path,"test")
#             file_name=os.path.basename(data_pic.local_path)
#             des_path=os.path.join(image_dir,file_name)
#             if os.path.exists(des_path):
#                 os.remove(des_path)
#             shutil.move(data_pic.local_path, image_dir)
#
#
#             labels_test_path=dataset_dirs["labels"]
#             label_dir=os.path.join(labels_test_path,"test")
#             file_name=os.path.basename(data_pic.label_txt_path)
#             des_path=os.path.join(label_dir,file_name)
#             if os.path.exists(des_path):
#                 os.remove(des_path)
#             shutil.move(data_pic.label_txt_path, label_dir)
#
#
#
#     custom_config = {
#         "epochs": 50,  # 快速测试用
#         "batch_size": 4,
#     }
#
#     # 启动后台训练
#     pid = await run_background_training(
#         dataset_dir=output_root,
#         weight_name=pt_name,
#         config_overrides=custom_config
#     )
#
#     print(f"pid--{pid}")
#     dao.insert_train_pid(task_id,train_pid=pid)
#
#
# def train_model(dataset_dir,weight_name="best_segmentation_model.pt", config_overrides=None):
#     """
#     训练模型并保存权重
#     :param weight_name: 自定义权重文件名（如 "uavid_seg_v1.pt"）
#     :param config_overrides: 覆盖默认配置的字典（可选）
#     """
#     # 合并配置（允许通过参数覆盖默认配置）
#
#     # 默认配置（可通过函数参数覆盖）
#     DEFAULT_CONFIG = {
#         "model": "pt/yolo11s-seg.pt",
#         "pretrained": True,
#         "data": os.path.join(dataset_dir, "data.yaml"),  # 关键修改：指向 data.yaml
#         "project": "UAVid_Segmentation",
#         "name": "v1.5_official",
#         "epochs": 1000,
#         "batch_size": 8,
#         "img_size": 640,
#         "workers": 4,
#         "optimizer": "SGD",
#         "lr0": 0.01,
#         "lrf": 0.01,
#         "momentum": 0.9,
#         "weight_decay": 0.0005,
#         "augment": True,
#         "hyp": {
#             "mosaic": 0.5,
#             "copy_paste": 0.2,
#             "mixup": 0.15,
#         },
#     }
#
#     config = DEFAULT_CONFIG.copy()
#     if config_overrides:
#         config.update(config_overrides)
#
#     # 初始化模型
#     model = YOLO(config["model"])
#
#     # 开始训练
#     results = model.train(
#         data=config["data"],
#         project=config["project"],
#         name=config["name"],
#         epochs=config["epochs"],
#         batch=config["batch_size"],
#         imgsz=config["img_size"],
#         workers=config["workers"],
#         optimizer=config["optimizer"],
#         lr0=config["lr0"],
#         lrf=config["lrf"],
#         momentum=config["momentum"],
#         weight_decay=config["weight_decay"],
#         augment=config["augment"],
#         device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
#     )
#
#     # 验证模型
#     metrics = model.val()
#     print(f"Validation mAP: {metrics.box_map:.2f} (box), {metrics.seg_map:.2f} (mask)")
#
#     # 保存最佳模型（使用自定义名称）
#     best_model = results.best_model
#     torch.save(best_model, weight_name)
#     print(f"Best model saved to: {weight_name}")
#
#
# # def run_background_training(output_root: str, weight_name="best_segmentation_model.pt", config_overrides=None):
# #     """使用 spawn 上下文启动进程"""
# #     ctx = multiprocessing.get_context('spawn')
# #     process = ctx.Process(
# #         target=train_model,
# #         args=(output_root, weight_name, config_overrides),
# #         daemon=False
# #     )
# #     process.start()
# #     return process.pid
#
# import asyncio
#
# async def run_background_training(dataset_dir, weight_name, config_overrides=None):
#     """异步启动训练进程"""
#     process = await asyncio.create_subprocess_exec(
#         sys.executable,
#         "train_worker.py",
#         "--dataset", dataset_dir,
#         "--weight", weight_name,
#         "--epochs", str(config_overrides.get("epochs", 50)),
#         "--batch", str(config_overrides.get("batch", 4)),
#         stdout=asyncio.subprocess.PIPE,
#         stderr=asyncio.subprocess.PIPE,
#     )
#     return process.pid


import asyncio
import os.path
import shutil
import sys
import threading
import time
import subprocess
import json
from pathlib import Path

from middleware.minio_util import downFullPathFile
from middleware.query_model import ModelConfigDAO
import yaml
import torch
from ultralytics import YOLO


async def download_train(task_id: str, bz_training_task_id: int, pt_name: str):
    """
    下载训练数据并启动训练
    这个函数负责准备数据，然后使用线程+subprocess创建独立进程执行训练
    """
    try:
        current_pid = os.getpid()
        print(f"Starting download and training for task {task_id} in process {current_pid}")

        DB_CONFIG = {
            "dbname": "smart_dev_123",
            "user": "postgres",
            "password": "root",
            "host": "8.137.54.85",
            "port": "5060"
        }

        # 创建DAO实例
        dao = ModelConfigDAO(DB_CONFIG)
        time_ns = time.time_ns()

        output_root = f"dataset/dataset-{time_ns}"

        try:
            if not os.path.exists(output_root):
                os.mkdir(output_root)
                print(f"Created output directory: {output_root}")
        except Exception as e:
            print(f"Failed to create output directory: {e}")
            raise

        try:
            # 获取标签和数据集信息
            list_labels = dao.get_labels(bz_training_task_id)
            list_datasets = dao.get_datasets(bz_training_task_id)
            label_yaml_list = dao.get_label_yaml(bz_training_task_id)

            print(
                f"Retrieved {len(list_labels)} labels, {len(list_datasets)} datasets, {len(label_yaml_list)} label configs")
        except Exception as e:
            print(f"Failed to retrieve data from database: {e}")
            raise

        # 定义数据结构（字典）
        uavid_config = {
            "path": "",  # 替换为你的绝对路径
            "train": "images/train",  # 训练集路径
            "val": "images/val",  # 验证集路径
            "test": "images/test",  # 测试集路径（可选）
            "names": {}
        }

        try:
            uavid_config["path"] = os.path.abspath(output_root)
            for i, item in enumerate(label_yaml_list):
                item.id_order = i
                uavid_config["names"][f"{i}"] = item.e_name

            # 生成 YAML 文件
            data_yaml = "data.yaml"
            with open(data_yaml, "w", encoding="utf-8") as f:
                yaml.dump(
                    uavid_config,
                    f,
                    default_flow_style=False,  # 禁用紧凑格式（保持多行）
                    allow_unicode=True,  # 允许 Unicode 字符
                    sort_keys=False  # 保持键的顺序
                )

            file_name = os.path.basename(data_yaml)
            des_path = os.path.join(output_root, file_name)
            if os.path.exists(des_path):
                os.remove(des_path)
            shutil.move(data_yaml, output_root)
            print(f"Generated YAML config at: {os.path.abspath(output_root)}")
        except Exception as e:
            print(f"Failed to generate YAML config: {e}")
            raise

        # 下载数据集
        invalid_indices = []

        try:
            for index, pic in enumerate(list_datasets):
                if hasattr(pic, 'resource_original_path') and pic.resource_original_path:  # 图像路径有效
                    try:
                        download_path = downFullPathFile(pic.resource_original_path)
                        if download_path:  # 下载成功
                            pic.local_path = download_path
                            pic.label_name = Path(download_path).stem  # 截取图片名称，用作标签
                            print(f"Downloaded file: {download_path}")
                        else:
                            invalid_indices.append(index)  # 存储不符合条件的索引，准备删除
                            print(f"Failed to download file: {pic.resource_original_path}")
                    except Exception as e:
                        invalid_indices.append(index)
                        print(f"Error downloading file {pic.resource_original_path}: {e}")
                else:
                    invalid_indices.append(index)  # 存储不符合条件的索引，准备删除
        except Exception as e:
            print(f"Error processing datasets: {e}")
            raise

        # 从后往前删除（避免删除时索引错乱）,删除不符合条件的list_datasets
        try:
            for idx in sorted(invalid_indices, reverse=True):
                del list_datasets[idx]

            print(f"Filtered datasets: {len(list_datasets)} valid items remaining")
        except Exception as e:
            print(f"Error filtering datasets: {e}")
            raise

        # 整理标签内容
        try:
            for data_pic in list_datasets:  # 整理完整的图像与标签集的对应关系
                for label in list_labels:
                    if hasattr(data_pic, 'id') and hasattr(label, 'id') and data_pic.id == label.id:
                        for item in label_yaml_list:
                            if hasattr(label, 'label_ids') and hasattr(item, 'id') and label.label_ids == item.id:
                                # 假设label有annotation_data属性
                                annotation = getattr(label, 'annotation_data', '')
                                current_content = getattr(data_pic, 'label_content', '')
                                data_pic.label_content = f"{current_content}{item.id_order} {annotation}\n"
        except Exception as e:
            print(f"Error organizing labels: {e}")
            raise

        # 创建标签文件
        try:
            for data_pic in list_datasets:
                if hasattr(data_pic, 'label_name'):
                    label_txt = f"{data_pic.label_name}.txt"
                    with open(label_txt, 'w', encoding='utf-8') as f:
                        f.write(getattr(data_pic, 'label_content', ''))
                    data_pic.label_txt_path = os.path.abspath(label_txt)
                    print(f"Created label file: {label_txt}")
        except Exception as e:
            print(f"Error creating label files: {e}")
            raise

        # 移动文件，制作数据集
        try:
            dataset_dirs = {
                "images": Path(output_root) / "images",
                "labels": Path(output_root) / "labels"
            }

            for ds_dir in dataset_dirs.values():
                (ds_dir / "val").mkdir(parents=True, exist_ok=True)
                (ds_dir / "train").mkdir(parents=True, exist_ok=True)
                (ds_dir / "test").mkdir(parents=True, exist_ok=True)

            print("Created dataset directory structure")
        except Exception as e:
            print(f"Error creating dataset directories: {e}")
            raise

        # 分配数据集到训练、验证、测试集
        try:
            count_pic = 0
            for data_pic in list_datasets:
                count_pic += 1

                # 80% 训练集, 10% 验证集, 10% 测试集
                if count_pic % 10 < 8:
                    split = "train"
                elif count_pic % 10 == 8:
                    split = "val"
                else:  # count_pic % 10 == 9
                    split = "test"

                # 移动图像文件
                if hasattr(data_pic, 'local_path') and os.path.exists(data_pic.local_path):
                    images_path = dataset_dirs["images"]
                    image_dir = os.path.join(images_path, split)
                    file_name = os.path.basename(data_pic.local_path)
                    des_path = os.path.join(image_dir, file_name)

                    if os.path.exists(des_path):
                        os.remove(des_path)
                    shutil.move(data_pic.local_path, image_dir)

                # 移动标签文件
                if hasattr(data_pic, 'label_txt_path') and os.path.exists(data_pic.label_txt_path):
                    labels_path = dataset_dirs["labels"]
                    label_dir = os.path.join(labels_path, split)
                    file_name = os.path.basename(data_pic.label_txt_path)
                    des_path = os.path.join(label_dir, file_name)

                    if os.path.exists(des_path):
                        os.remove(des_path)
                    shutil.move(data_pic.label_txt_path, label_dir)

            print(f"Organized {count_pic} files into dataset splits")
        except Exception as e:
            print(f"Error organizing dataset splits: {e}")
            raise

        # 训练配置
        custom_config = {
            "epochs": 50,  # 快速测试用
            "batch_size": 4,
            "workers": 0,  # 禁用多进程数据加载
        }

        # 保存训练配置到文件
        config_file = f"train_config_{task_id}.json"
        with open(config_file, 'w', encoding='utf-8') as f:
            json.dump({
                'dataset_dir': output_root,
                'pt_name': pt_name,
                'config_overrides': custom_config,
                'db_config': DB_CONFIG,
                'task_id': task_id
            }, f)

        print(f"Training data preparation completed for task {task_id}")

        # 在Windows上使用线程+subprocess创建训练进程
        # 避免使用asyncio.create_subprocess_exec
        loop = asyncio.get_event_loop()
        training_pid = await loop.run_in_executor(
            None,  # 使用默认的线程池
            start_training_process,
            config_file
        )

        if training_pid:
            print(f"pid--{training_pid}")
            dao.insert_train_pid(task_id, train_pid=training_pid)
            return training_pid
        else:
            raise Exception("Failed to start training process")

    except Exception as e:
        print(f"Training failed for task {task_id}: {e}", exc_info=True)
        raise


def start_training_process(config_file: str) -> int:
    """
    在独立线程中启动训练进程
    使用subprocess.Popen创建训练进程
    """
    try:
        # 创建训练脚本内容
        train_script = '''
import sys
import json
import os
import torch
from ultralytics import YOLO

class MockModelConfigDAO:
    def __init__(self, db_config):
        self.db_config = db_config

    def insert_train_pid(self, task_id, train_pid):
        print(f"Inserted training PID {train_pid} for task {task_id}")

def train_model(dataset_dir, weight_name="best_segmentation_model.pt", config_overrides=None):
    """
    训练模型并保存权重
    """
    try:
        current_pid = os.getpid()
        print(f"Starting model training in process {current_pid} with dataset: {dataset_dir}")

        # 默认配置（可通过参数覆盖）
        DEFAULT_CONFIG = {
            "model": "pt/yolo11s-seg.pt",
            "pretrained": True,
            "data": os.path.join(dataset_dir, "data.yaml"),
            "project": "UAVid_Segmentation",
            "name": "v1.5_official",
            "epochs": 1000,
            "batch_size": 8,
            "img_size": 640,
            "workers": 0,  # 禁用多进程数据加载
            "optimizer": "SGD",
            "lr0": 0.01,
            "lrf": 0.01,
            "momentum": 0.9,
            "weight_decay": 0.0005,
            "augment": True,
            "hyp": {
                "mosaic": 0.5,
                "copy_paste": 0.2,
                "mixup": 0.15,
            },
        }

        config = DEFAULT_CONFIG.copy()
        if config_overrides:
            config.update(config_overrides)

        print(f"Training config: {config}")

        # 检查数据配置文件
        data_path = config["data"]
        if not os.path.exists(data_path):
            raise FileNotFoundError(f"Data configuration file not found: {data_path}")

        # 初始化模型
        model = YOLO(config["model"])
        print(f"Model initialized with: {config["model"]}")

        # 开始训练
        results = model.train(
            data=config["data"],
            project=config["project"],
            name=config["name"],
            epochs=config["epochs"],
            batch=config["batch_size"],
            imgsz=config["img_size"],
            workers=config["workers"],
            optimizer=config["optimizer"],
            lr0=config["lr0"],
            lrf=config["lrf"],
            momentum=config["momentum"],
            weight_decay=config["weight_decay"],
            augment=config["augment"],
            device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        )
        print(f"Training completed successfully in process {current_pid}")

        # 验证模型
        metrics = model.val()
        print(f"Validation mAP: {metrics.box:.2f} (box), {metrics.seg:.2f} (mask)")

        # 保存最佳模型
        try:
            if hasattr(results, 'best') and results.best:
                best_model_path = results.best
                if os.path.exists(best_model_path):
                    import shutil
                    shutil.copy2(best_model_path, weight_name)
                    print(f"Best model saved to: {os.path.abspath(weight_name)}")
                else:
                    torch.save(model.state_dict(), weight_name)
                    print(f"Best model path not found, saved state dict to: {weight_name}")
            else:
                torch.save(model.state_dict(), weight_name)
                print(f"Saved model state dict to: {weight_name}")
        except Exception as e:
            print(f"Warning: Failed to save best model: {e}")
            torch.save(model.state_dict(), weight_name)
            print(f"Fallback: Saved model state dict to: {weight_name}")

        return True

    except Exception as e:
        print(f"Model training failed in process {os.getpid()}: {e}", exc_info=True)
        raise

def main():
    if len(sys.argv) != 2:
        print("Usage: python -c '<script>' <config_file>")
        sys.exit(1)

    config_file = sys.argv[1]

    try:
        with open(config_file, 'r', encoding='utf-8') as f:
            config = json.load(f)

        # 提取配置
        dataset_dir = config['dataset_dir']
        pt_name = config['pt_name']
        config_overrides = config['config_overrides']
        db_config = config['db_config']
        task_id = config['task_id']

        # 获取当前进程ID
        pid = os.getpid()
        print(f"Training process started for task {task_id} with PID {pid}")

        # 记录PID到数据库
        try:
            from middleware.query_model import ModelConfigDAO
            dao = ModelConfigDAO(db_config)
        except ImportError:
            dao = MockModelConfigDAO(db_config)

        dao.insert_train_pid(task_id, train_pid=pid)

        # 执行训练
        success = train_model(dataset_dir, pt_name, config_overrides)

        if success:
            print(f"Training completed successfully for task {task_id}")
            sys.exit(0)
        else:
            print(f"Training failed for task {task_id}")
            sys.exit(1)

    except Exception as e:
        print(f"Training error: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)

if __name__ == "__main__":
    main()
'''

        # 保存训练脚本
        script_path = f"train_worker_{os.path.basename(config_file).split('_')[2].split('.')[0]}.py"
        with open(script_path, 'w', encoding='utf-8') as f:
            f.write(train_script)

        # 使用subprocess.Popen创建训练进程
        # 在Windows上使用shell=True可以解决一些路径问题
        process = subprocess.Popen([
            sys.executable, script_path, config_file
        ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=False)

        print(f"Started training process with PID {process.pid}")

        # 启动线程来处理输出
        threading.Thread(target=handle_process_output, args=(process,), daemon=True).start()

        return process.pid

    except Exception as e:
        print(f"Failed to start training process: {e}", exc_info=True)
        return 0


def handle_process_output(process: subprocess.Popen):
    """
    处理子进程的输出
    """
    try:
        # 分别读取stdout和stderr
        def read_stream(stream, stream_name):
            while True:
                line = stream.readline()
                if not line:
                    break
                line = line.strip()
                print(f"[{stream_name}] {line}")

        # 启动线程读取stdout和stderr
        stdout_thread = threading.Thread(target=read_stream, args=(process.stdout, 'STDOUT'))
        stderr_thread = threading.Thread(target=read_stream, args=(process.stderr, 'STDERR'))

        stdout_thread.start()
        stderr_thread.start()

        # 等待进程完成
        stdout_thread.join()
        stderr_thread.join()

        # 获取返回码
        return_code = process.wait()
        print(f"Training process completed with return code: {return_code}")

    except Exception as e:
        print(f"Error handling process output: {e}", exc_info=True)


if __name__ == "__main__":
    """
    直接运行时的入口点
    用于测试
    """
    import sys

    if len(sys.argv) < 4:
        print(f"Usage: {sys.argv[0]} <task_id> <train_task_id> <pt_name>")
        sys.exit(1)

    task_id = sys.argv[1]
    train_task_id = int(sys.argv[2])
    pt_name = sys.argv[3]

    try:
        # 创建事件循环
        loop = asyncio.get_event_loop()
        pid = loop.run_until_complete(download_train(task_id, train_task_id, pt_name))
        print(f"Training started in process {pid}")

        # 保持事件循环运行
        try:
            loop.run_forever()
        except KeyboardInterrupt:
            print("Received keyboard interrupt. Exiting...")
        finally:
            loop.close()

    except Exception as e:
        print(f"Training failed: {e}", exc_info=True)
        sys.exit(1)