
Within the Qwen3 Embedding stack, the data generation module uses an LLM-driven structured synthesis framework to break through the training-data bottleneck of traditional text embedding models. This approach helped the model top the MTEB multilingual leaderboard with a score of 70.58, and reach 80.68 in specialized areas such as code retrieval. Drawing on the official technical report and the open-source practice around it, this article walks through the data generation pipeline in detail.
Qwen3 uses the Qwen3-32B model to generate a structured semantic configuration for each document, controlling the semantic profile of the resulting query along three dimensions: Character (the persona asking the question), Question_Type, and Difficulty.

Configuration example (medical document):
```json
{
  "Character": "cardiologist",
  "Question_Type": "diagnosis",
  "Difficulty": "university"
}
```
When a configuration is turned into a natural-language query, four parameters provide fine-grained control: question type, difficulty, language, and length.

Query generation example (config-to-query mapping):
```python
# Input configuration
config = {
    "Character": "farmer",
    "Question_Type": "keywords",
    "Difficulty": "high_school",
    "Language": "Chinese",
    "Length": 12
}
# Generated query ("how to adjust duck feed in spring")
query = "春季养鸭饲料调整方法"
```
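The article does not reproduce the actual generation prompt. As a minimal sketch, assuming the four parameters are simply threaded into an instruction for Qwen3-32B (the `build_query_prompt` helper and its wording are illustrative assumptions, not the team's published prompt):

```python
# Hypothetical helper: map a configuration to a query-generation prompt.
def build_query_prompt(config: dict, document: str) -> str:
    return (
        f"You are a {config['Character']}. "
        f"Write one {config.get('Language', 'English')} query of roughly "
        f"{config.get('Length', 10)} words, of type {config['Question_Type']} "
        f"at {config['Difficulty']} difficulty, answered by this document.\n"
        f"Document: {document}\nQuery:"
    )
```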
Stage one uses Qwen3-32B to generate 150 million weakly supervised text pairs covering four task types; the technical report names categories including retrieval, bitext mining, classification, and semantic textual similarity (STS).
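The per-task prompts are likewise not public; the following sketch of task-specific templates is an assumption made for illustration (all template wording is invented):

```python
# Hypothetical task-specific templates, keyed by the four task types above.
TASK_TEMPLATES = {
    "retrieval": "Write a search query that this document answers: {doc}",
    "bitext_mining": "Translate this sentence into {lang}: {doc}",
    "classification": "Write a new text of the same category as: {doc}",
    "sts": "Paraphrase this sentence, preserving its meaning: {doc}",
}

def render_template(task: str, doc: str, lang: str = "English") -> str:
    return TASK_TEMPLATES[task].format(doc=doc, lang=lang)
```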
A three-layer filtering mechanism then distills 12 million high-quality samples out of the 150 million pairs.

Filtering results, before and after:
| Metric | Raw synthetic data | After filtering |
| --- | --- | --- |
| Relevance accuracy | 68.3% | 89.7% |
| Domain coverage | 18 domains | 27 domains |
| Negative-sample effectiveness | 0.12 | 0.235 |
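The three layers are not enumerated in the surviving text. One plausible first layer, matching the 0.7 threshold used in `filter_high_quality_pairs` further below, is a plain embedding-similarity gate; this sketch is an assumption, not the report's published procedure:

```python
import numpy as np

# Hypothetical first filtering layer: keep indices of pairs whose
# query/document cosine similarity exceeds a fixed threshold.
def similarity_gate(similarities: np.ndarray, threshold: float = 0.7) -> np.ndarray:
    return np.where(similarities > threshold)[0]
```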
Vertical domains such as agriculture and finance additionally apply domain-knowledge injection.
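The injection mechanism is not detailed here. A minimal sketch, assuming injection means prepending curated domain terminology to the generation prompt (the `DOMAIN_TERMS` lists and helper are illustrative):

```python
# Hypothetical domain-knowledge injection: bias generation toward
# in-domain vocabulary by listing curated terms in the prompt.
DOMAIN_TERMS = {
    "agriculture": ["饲料配比", "育雏", "越冬管理"],
    "finance": ["久期", "对冲", "净值回撤"],
}

def inject_domain_knowledge(prompt: str, domain: str) -> str:
    terms = ", ".join(DOMAIN_TERMS.get(domain, []))
    return f"Prefer natural use of domain terminology ({terms}).\n{prompt}"
```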
For the code retrieval task, a three-layer code-semantics generation framework is designed.
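Those three layers are not spelled out in the surviving text either. A sketch under the assumption that they correspond to three query granularities (overall intent, API usage, implementation detail):

```python
# Hypothetical three-granularity query generation for code retrieval;
# the report's actual layer definitions are not given in this article.
CODE_QUERY_LAYERS = {
    "intent": "What does this code do overall?",
    "api_usage": "Which library calls does this code rely on?",
    "implementation": "How does this code implement its core logic?",
}

def code_queries(snippet: str) -> dict:
    # In practice each question would be rewritten by Qwen3-32B
    # conditioned on the snippet; here we just pair them up.
    return {layer: (q, snippet) for layer, q in CODE_QUERY_LAYERS.items()}
```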
Example implementation of the configuration generator:
```python
import json

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


class Qwen3DataGenerator:
    def __init__(self, model_path="Qwen/Qwen3-32B"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
        # Candidate personas for the Character dimension
        self.persona_hub = ["farmer", "researcher", "engineer", "doctor", "teacher"]

    def generate_config(self, document, language="English"):
        """Generate the three-dimension configuration for a document."""
        prompt = f"""
Given a document, generate a JSON configuration with Character, Question_Type, Difficulty.
Document: {document}
Language: {language}
Character candidates: {self.persona_hub}
Question_Type options: keywords, acquire_knowledge, summary, yes_or_no, background
Difficulty options: high_school, university, phd
Output JSON:
"""
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9
        )
        # Decode only the newly generated tokens, not the echoed prompt
        config_json = self.tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        )
        # json.loads is safer than eval(); assumes the model emits bare JSON
        return json.loads(config_json)
```
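A usage sketch (the document string is illustrative; running this requires downloading the full Qwen3-32B weights and substantial GPU memory):

```python
# Usage sketch for the generator above.
generator = Qwen3DataGenerator()
config = generator.generate_config(
    "Aspirin inhibits platelet aggregation and is used in coronary artery disease.",
    language="English",
)
print(config)  # e.g. {"Character": "doctor", "Question_Type": "acquire_knowledge", ...}
```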
```python
from sentence_transformers import SentenceTransformer, util
import numpy as np


class QueryGenerator:
    def __init__(self, embedding_model="Qwen/Qwen3-Embedding-0.6B"):
        self.embedding_model = SentenceTransformer(embedding_model)

    def generate_query(self, config, document):
        """Generate a query from the configuration."""
        role = config["Character"]
        q_type = config["Question_Type"]
        difficulty = config["Difficulty"]
        lang = config.get("Language", "Chinese")
        # Build the generation prompt
        prompt = f"""
Generate a {lang} query based on the document and config.
Document: {document}
Character: {role}
Question_Type: {q_type}
Difficulty: {difficulty}
Query:
"""
        # Placeholder: in a real pipeline this prompt is sent to Qwen3-32B
        return f"Sample query: a {q_type} query generated for a {role}"

    def filter_high_quality_pairs(self, pairs, threshold=0.7):
        """Keep only pairs with high query/document similarity."""
        queries = [p["query"] for p in pairs]
        docs = [p["document"] for p in pairs]
        # Embed queries and documents
        query_emb = self.embedding_model.encode(queries, batch_size=32)
        doc_emb = self.embedding_model.encode(docs, batch_size=32)
        # Cosine similarity of each aligned (query, document) pair
        similarities = util.cos_sim(query_emb, doc_emb).diag().cpu().numpy()
        # Keep samples whose similarity exceeds the threshold
        high_quality_indices = np.where(similarities > threshold)[0]
        return [pairs[i] for i in high_quality_indices]
```
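A usage sketch with two toy pairs (the data values are illustrative; the mismatched second pair should fall below the 0.7 threshold):

```python
# Usage sketch: filter toy pairs with the embedding-similarity gate.
qg = QueryGenerator()
pairs = [
    {"query": "春季养鸭饲料调整方法", "document": "春季养鸭时,应适当增加能量饲料的比例以应对低温天气"},
    {"query": "如何办理信用卡", "document": "发财树春季养护需要保证充足光照"},
]
kept = qg.filter_high_quality_pairs(pairs, threshold=0.7)
print(f"kept {len(kept)} of {len(pairs)} pairs")
```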
```python
from datasets import Dataset
from sentence_transformers import SentenceTransformer, util
import pandas as pd


def mine_hard_negatives(dataset_path, output_path):
    """Mine hard negatives with sentence-transformers' built-in utility."""
    # Load the (query, document) pairs
    data = pd.read_json(dataset_path)
    dataset = Dataset.from_pandas(data)
    # Load the Qwen3 embedding model
    model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
    # util.mine_hard_negatives returns a datasets.Dataset
    hard_negatives = util.mine_hard_negatives(
        dataset,
        model,
        anchor_column_name="query",
        positive_column_name="document",
        num_negatives=5,          # negatives per (query, positive) pair
        range_min=20,             # skip the 20 most similar candidates
        range_max=50,             # consider at most the top-50 candidates
        max_score=0.8,            # negatives must score below 0.8
        absolute_margin=0.1,      # and at least 0.1 below the positive
        sampling_strategy="top",  # take the hardest remaining candidates
        batch_size=64
    )
    # Save as JSON lines
    hard_negatives.to_json(output_path, orient="records", lines=True)


# Run hard-negative mining
mine_hard_negatives(
    dataset_path="agriculture_data.json",
    output_path="agriculture_hard_negatives.json"
)
```
Qwen3's data generation approach marks the embedding field's shift into an LLM self-loop era: a closed loop in which an LLM produces the training data for embedding models, markedly improving both data quality and diversity, with transformative implications for industry practice. The consolidated end-to-end pipeline is given below:
```python
import json
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModelForCausalLM, AutoTokenizer

class Qwen3DataGenerator:
    def __init__(self, model_path="Qwen/Qwen3-32B"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
        # Candidate personas for the Character dimension
        self.persona_hub = ["farmer", "researcher", "engineer", "doctor", "teacher"]

    def generate_config(self, document, language="English"):
        """Generate the three-dimension configuration for a document."""
        prompt = f"""
Given a document, generate a JSON configuration with Character, Question_Type, Difficulty.
Document: {document}
Language: {language}
Character candidates: {self.persona_hub}
Question_Type options: keywords, acquire_knowledge, summary, yes_or_no, background
Difficulty options: high_school, university, phd
Output JSON:
"""
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9
        )
        # Decode only the newly generated tokens, not the echoed prompt
        config_json = self.tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        )
        # json.loads is safer than eval() for parsing model output
        return json.loads(config_json)

class QueryGenerator:
    def __init__(self, embedding_model="Qwen/Qwen3-Embedding-0.6B"):
        self.embedding_model = SentenceTransformer(embedding_model)

    def generate_query(self, config, document):
        role = config["Character"]
        q_type = config["Question_Type"]
        difficulty = config["Difficulty"]
        lang = config.get("Language", "Chinese")
        # Placeholder; a real pipeline would prompt Qwen3-32B here
        return f"Sample query: as a {role}, a {q_type} question at {difficulty} difficulty about: {document[:20]}..."

    def filter_high_quality_pairs(self, pairs, threshold=0.7):
        queries = [p["query"] for p in pairs]
        docs = [p["document"] for p in pairs]
        query_emb = self.embedding_model.encode(queries, batch_size=32)
        doc_emb = self.embedding_model.encode(docs, batch_size=32)
        similarities = util.cos_sim(query_emb, doc_emb).diag().cpu().numpy()
        high_quality_indices = np.where(similarities > threshold)[0]
        return [pairs[i] for i in high_quality_indices]

def mine_hard_negatives(dataset_path, output_path):
    data = pd.read_json(dataset_path)
    dataset = Dataset.from_pandas(data)
    model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
    # util.mine_hard_negatives returns a datasets.Dataset
    hard_negatives = util.mine_hard_negatives(
        dataset,
        model,
        anchor_column_name="query",
        positive_column_name="document",
        num_negatives=5,
        range_min=20,
        range_max=50,
        max_score=0.8,
        absolute_margin=0.1,
        sampling_strategy="top",
        batch_size=64
    )
    hard_negatives.to_json(output_path, orient="records", lines=True)

def generate_embedding_data(documents, output_path):
    """Generate config + query pairs for a list of documents, then filter."""
    generator = Qwen3DataGenerator()
    query_gen = QueryGenerator()
    data_pairs = []
    for doc in documents:
        config = generator.generate_config(doc)
        query = query_gen.generate_query(config, doc)
        data_pairs.append({
            "query": query,
            "document": doc,
            "config": config
        })
    # Keep only high-similarity pairs
    high_quality_pairs = query_gen.filter_high_quality_pairs(data_pairs)
    Dataset.from_list(high_quality_pairs).to_json(output_path, orient="records", lines=True)
    return high_quality_pairs

def run_swift_sft():
    """Run LoRA fine-tuning through the ms-swift CLI."""
    os.system("""
swift sft \
    --model /path/to/Qwen3-Embedding-0.6B \
    --task_type embedding \
    --model_type qwen3_emb \
    --train_type lora \
    --dataset /path/to/agriculture_data.json \
    --split_dataset_ratio 0.05 \
    --eval_strategy steps \
    --output_dir output/qwen3-agriculture \
    --eval_steps 100 \
    --num_train_epochs 1 \
    --save_steps 100 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --learning_rate 6e-6 \
    --loss_type infonce \
    --label_names labels \
    --dataloader_drop_last true
""")

def evaluate_model(model_path, test_data_path):
    model = SentenceTransformer(model_path)
    test_data = load_dataset("json", data_files=test_data_path)["train"]
    queries = test_data["query"]
    docs = test_data["document"]
    query_emb = model.encode(queries, batch_size=32)
    doc_emb = model.encode(docs, batch_size=32)
    similarities = util.cos_sim(query_emb, doc_emb)
    # Compute NDCG and other retrieval metrics here
    return similarities

if __name__ == "__main__":
    # Sample documents (kept in Chinese: the pipeline targets Chinese queries)
    sample_documents = [
        "春季养鸭时,应适当增加能量饲料的比例以应对低温天气",
        "发财树春季养护需要保证充足光照",
        "食品包装中常用的阻隔性材料主要用于保质保鲜"
    ]
    # 1. Generate configurations and queries
    generated_data = generate_embedding_data(sample_documents, "generated_data.json")
    # 2. Mine hard negatives
    mine_hard_negatives("generated_data.json", "hard_negatives.json")
    # 3. Run LoRA fine-tuning
    run_swift_sft()
    # 4. Evaluate the model
    evaluation_results = evaluate_model("output/qwen3-agriculture", "hard_negatives.json")
    print("Evaluation similarity matrix shape:", evaluation_results.shape)
```
The author's knowledge here is limited; corrections and discussion in the comments are welcome.

Reposted from 鸿煊的学习笔记 (Hongxuan's Study Notes); author: 乘风破浪jxj.