diff --git a/docs/swift_train_and_infer.md b/docs/swift_train_and_infer.md
index 1e74607..693efe6 100644
--- a/docs/swift_train_and_infer.md
+++ b/docs/swift_train_and_infer.md
@@ -21,20 +21,20 @@ CUDA_VISIBLE_DEVICES=0 swift infer --model_type minicpm-v-v2_5-chat
 2. You can also pass the additional arguments below when running inference:
 ```
-model_id_or_path # 可以写huggingface的模型id或者本地模型地址
-infer_backend ['AUTO', 'vllm', 'pt'] # 后段推理,默认auto
-dtype ['bf16', 'fp16', 'fp32', 'AUTO'] # 计算精度
-max_length # 最大长度
-max_new_tokens: int = 2048 #最多生成多少token
-do_sample: bool = True # 是否采样
-temperature: float = 0.3 # 生成时的温度系数
+model_id_or_path # Can be the model ID from Hugging Face or the local path to the model
+infer_backend ['AUTO', 'vllm', 'pt'] # Backend for inference; the default is AUTO
+dtype ['bf16', 'fp16', 'fp32', 'AUTO'] # Computational precision
+max_length # Maximum sequence length
+max_new_tokens: int = 2048 # Maximum number of tokens to generate
+do_sample: bool = True # Whether to sample during generation
+temperature: float = 0.3 # Sampling temperature during generation
 top_k: int = 20
 top_p: float = 0.7
-repetition_penalty: float = 1.
-num_beams: int = 1
-stop_words: List[str] = None
-quant_method ['bnb', 'hqq', 'eetq', 'awq', 'gptq', 'aqlm'] # 模型的量化方式
-quantization_bit [0, 1, 2, 3, 4, 8] 默认是0,代表不使用量化
+repetition_penalty: float = 1. # Penalty for repetition
+num_beams: int = 1 # Number of beams for beam search
+stop_words: List[str] = None # List of stop words
+quant_method ['bnb', 'hqq', 'eetq', 'awq', 'gptq', 'aqlm'] # Quantization method for the model
+quantization_bit [0, 1, 2, 3, 4, 8] # Default is 0, which means no quantization is used
 ```
 3. Example:
 ``` shell
@@ -48,36 +48,36 @@
 The following demonstrates using Python code to initiate inference with the MiniCPM-Llama3-V-2_5 model:
 
 ```python
 import os
-os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' # 设置显卡数
+os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' # Specify which GPUs are visible
 from swift.llm import (
     get_model_tokenizer, get_template, inference, ModelType,
     get_default_template_type, inference_stream
-) # 导入必要模块
+) # Import the necessary modules
-from swift.utils import seed_everything # 设置随机种子
+from swift.utils import seed_everything # Set the random seed
 import torch
 
 model_type = ModelType.minicpm_v_v2_5_chat
-template_type = get_default_template_type(model_type) # 获取模板类型,主要是用于特殊token的构造和图像的处理流程
+template_type = get_default_template_type(model_type) # Obtain the template type, used mainly for constructing special tokens and the image processing workflow
 print(f'template_type: {template_type}')
 
 model, tokenizer = get_model_tokenizer(model_type, torch.bfloat16,
-                                       model_id_or_path='/root/ld/ld_model_pretrain/MiniCPM-Llama3-V-2_5',
-                                       model_kwargs={'device_map': 'auto'}) # 加载模型,并设置模型类型,模型路径,模型参数,设备分配等,计算精度等等
+                                       model_id_or_path='/root/ld/ld_model_pretrain/MiniCPM-Llama3-V-2_5',
+                                       model_kwargs={'device_map': 'auto'}) # Load the model and tokenizer, specifying the model type, model path, computational precision, and device allocation
 model.generation_config.max_new_tokens = 256
-template = get_template(template_type, tokenizer) # 根据模版类型构造模板
+template = get_template(template_type, tokenizer) # Construct the template based on the template type
 seed_everything(42)
 
-images = ['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/road.png'] # 图片地址
-query = '距离各城市多远?'
-response, history = inference(model, template, query, images=images) # 推理获得结果
+images = ['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/road.png'] # Image URL
+query = 'How far is it from each city?'
+response, history = inference(model, template, query, images=images) # Run inference to obtain the result
 print(f'query: {query}')
 print(f'response: {response}')
 
-# 流式
-query = '距离最远的城市是哪?'
-gen = inference_stream(model, template, query, history, images=images) # 调用流式输出接口
+# Streaming output
+query = 'Which city is the farthest?'
+gen = inference_stream(model, template, query, history, images=images) # Call the streaming output interface
 print_idx = 0
 print(f'query: {query}\nresponse: ', end='')
 for response, history in gen:
@@ -92,9 +92,9 @@
 print(f'history: {history}')
 SWIFT supports training on a local dataset; the training steps are as follows:
 1. Format the training data like this:
 ```jsonl
-{"query": "这张图片描述了什么", "response": "这张图片有一个大熊猫", "images": ["local_image_path"]}
-{"query": "这张图片描述了什么", "response": "这张图片有一个大熊猫", "history": [], "images": ["image_path"]}
-{"query": "竹子好吃么", "response": "看大熊猫的样子挺好吃呢", "history": [["这张图有什么", "这张图片有大熊猫"], ["大熊猫在干嘛", "吃竹子"]], "images": ["image_url"]}
+{"query": "What does this picture describe?", "response": "There is a giant panda in this picture.", "images": ["local_image_path"]}
+{"query": "What does this picture describe?", "response": "There is a giant panda in this picture.", "history": [], "images": ["image_path"]}
+{"query": "Is bamboo tasty?", "response": "It seems pretty tasty judging by the panda's expression.", "history": [["What's in this picture?", "There's a giant panda in this picture."], ["What is the panda doing?", "Eating bamboo."]], "images": ["image_url"]}
 ```
 2. LoRA Tuning:
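For reference on the JSONL training format introduced in the last hunk: each line is one standalone JSON object, where `query`/`response` form the current turn, `history` holds earlier `[query, response]` pairs, and `images` lists local paths or URLs. As a minimal sketch (not part of the SWIFT docs; the file name `train.jsonl` and the image path are placeholders), records in this shape can be written with Python's standard library:

```python
import json

# Hypothetical records covering the two shapes shown above:
# a single-turn sample and a multi-turn sample with history.
records = [
    {
        "query": "What does this picture describe?",
        "response": "There is a giant panda in this picture.",
        "images": ["/path/to/panda.jpg"],  # placeholder local path
    },
    {
        "query": "Is bamboo tasty?",
        "response": "It seems pretty tasty judging by the panda's expression.",
        "history": [
            ["What's in this picture?", "There's a giant panda in this picture."],
            ["What is the panda doing?", "Eating bamboo."],
        ],
        "images": ["/path/to/panda.jpg"],  # placeholder local path
    },
]

# JSONL means one JSON object per line; ensure_ascii=False keeps any
# non-ASCII text (e.g. Chinese queries) human-readable in the file.
with open("train.jsonl", "w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
```

The resulting file can then be supplied as the local dataset for the LoRA tuning step above.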