diff --git a/.obsidian/workspace.json b/.obsidian/workspace.json index f970aeb..da93c7d 100644 --- a/.obsidian/workspace.json +++ b/.obsidian/workspace.json @@ -13,13 +13,13 @@ "state": { "type": "markdown", "state": { - "file": "docs/chapter8/bert/modeling/modeling.md", + "file": "docs/chapter6/financial_report/financial_report.md", "mode": "source", "backlinks": false, "source": false }, "icon": "lucide-file", - "title": "modeling" + "title": "financial_report" } } ] @@ -68,10 +68,10 @@ "state": { "type": "outline", "state": { - "file": "docs/chapter8/bert/modeling/modeling.md" + "file": "docs/chapter6/financial_report/financial_report.md" }, "icon": "lucide-list", - "title": "modeling 的大纲" + "title": "financial_report 的大纲" } } ] @@ -87,10 +87,10 @@ }, "active": "a00f9c294cc735a6", "lastOpenFiles": [ + "docs/chapter8/bert/modeling/modeling.md", "docs/chapter8/repositories/repositories.md", "docs/chapter8/repositories_index.md", "docs/chapter8/bert/tokenization/tokenization.md", - "docs/chapter8/bert/modeling/modeling.md", "docs/chapter8/bert/configuration/configuration.md", "docs/chapter1/dataset_tour/datasets.md", "docs/chapter8/bert/tokenization/tokenizer.md", diff --git a/.vscode/settings.json b/.vscode/settings.json index f3f6b3d..93fbfa9 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,6 @@ { "markdownlint.config": { "MD010": false, + "MD033": false } } \ No newline at end of file diff --git a/assets/thumbnail.png b/assets/thumbnail.png new file mode 100644 index 0000000..22a2d7c Binary files /dev/null and b/assets/thumbnail.png differ diff --git a/docker-compose/Dockerfile b/docker-compose/Dockerfile index 7f1d7a0..3e00b8a 100644 --- a/docker-compose/Dockerfile +++ b/docker-compose/Dockerfile @@ -10,12 +10,16 @@ RUN apt-get update && apt-get install -y \ wget \ curl \ git \ + unzip \ + inetutils-ping \ + tmux \ && apt-get clean COPY --from=miniconda-stage /opt/conda /opt/conda ENV PATH="/opt/conda/bin:${PATH}" WORKDIR /root +RUN 
echo "export HF_ENDPOINT=https://hf-mirror.com" >> /root/.bashrc COPY condarc /root/.condarc COPY Dockerfile /root/dockerfile/Dockerfile COPY pip.conf /root/.pip/pip.conf diff --git a/docs/appendix/env_config/env.md b/docs/appendix/env_config/env.md index 6bc27eb..a7f4911 100644 --- a/docs/appendix/env_config/env.md +++ b/docs/appendix/env_config/env.md @@ -15,13 +15,13 @@ title: 环境配置 注意:选择下载的 Miniconda3 版本需要和电脑处理器的架构吻合。为了方便,在此下方直接提供各大操作系统推荐的下载链接。 -| 系统 | 下载地址 | -| :---: | --- | -| Windows | | -| macOS(Intel) | | -| macOS(M/ARM) | | -| Linux(x64) | | -| Linux(ARM) | | +| 系统 | 下载地址 | +| :------------: | -------------------------------------------------------------------------- | +| Windows | | +| macOS(Intel) | | +| macOS(M/ARM) | | +| Linux(x64) | | +| Linux(ARM) | | ### 安装 Miniconda @@ -222,7 +222,7 @@ auto_activate_base: false ![conda_activate_env](./imgs/conda_activate_env.png){ width="600" } - 安装包:`conda install package_name`或者`pip install package_name` -- `pip`在安装包时临时更换镜像源:`pip install package_name -i https://pypi.tuna.tsinghua.edu.cn/simple` + - `pip`在安装包时临时更换镜像源:`pip install package_name -i https://pypi.tuna.tsinghua.edu.cn/simple` - 卸载包:`conda remove package_name`或者`pip uninstall package_name` - 显示所有安装的包:`conda list` - 删除指定虚拟环境:`conda remove -n env_name --all` @@ -235,7 +235,7 @@ auto_activate_base: false ## 安装函数库 ???+ warning - 💯 当你想在虚拟环境安装包的时候,确认你正处在正确的虚拟环境中!! + :100:当你想在虚拟环境安装包的时候,确认你正处在正确的虚拟环境中!! ```bash title='pip/conda' pip install numpy pandas matplotlib transformers datasets peft evaluate diffusers gradio torch jupyterlab diff --git a/docs/chapter1/dataset_tour/datasets.md b/docs/chapter1/dataset_tour/datasets.md index 143b02d..ca264d2 100644 --- a/docs/chapter1/dataset_tour/datasets.md +++ b/docs/chapter1/dataset_tour/datasets.md @@ -93,7 +93,7 @@ data = load_dataset("hfl/cmrc2018") 通过返回结果可以看出 `data` 的数据类型为 `DatasetDict`,它是 `Datasets` 库中重要的数据类型。 -!!! Note "train_test_split" +!!! 
note "train_test_split" 并非所有数据集都包含训练集、验证集和测试集。有些数据集可能只有一个或两个子集。 对于数据集 `hfl/cmrc2018` 存在训练集、验证集和测试集。但是对于 `LooksJuicy/ruozhiba` 却只存在训练集。 @@ -180,7 +180,7 @@ Dataset({ ``` -!!! Note "配置" +!!! note "配置" ### 配置 (Configurations) diff --git a/docs/chapter6/code_index.md b/docs/chapter6/code_index.md index 301ad34..dd3669f 100644 --- a/docs/chapter6/code_index.md +++ b/docs/chapter6/code_index.md @@ -5,9 +5,12 @@ title: 索引 主页 -- 多标签分类:[面对有害言论, 是时候让AI重拳出击了](./mlcoftc/multi-label-classification-of-toxic-comments.md) -- 抽取式阅读理解:[CMRC2018](./cmrc/cmrc.md) -- 文本摘要:[LCSTS短文本新闻摘要](./text-summary/text-summary.md) -- 集装箱编号位置检测:[DETR目标检测](./container-detr/container-detr.md) -- 文本翻译:[中英文本翻译](./translation/translation.md) -- 简单去噪:[ddpm-unet](./ddpm-unet-mnist/ddpm-unet-mnist.md) +| 模块名称 | 链接 | +| ------------------ | ----------------------------------------------------------------------------------------------- | +| 多标签分类 | [面对有害言论,是时候让AI重拳出击了](./mlcoftc/multi-label-classification-of-toxic-comments.md) | +| 抽取式阅读理解 | [CMRC2018](./cmrc/cmrc.md) | +| 文本摘要 | [LCSTS短文本新闻摘要](./text-summary/text-summary.md) | +| 集装箱编号位置检测 | [目标检测](./container-detr/container-detr.md) | +| 文本翻译 | [中英文本翻译](./translation/translation.md) | +| 扩散去噪 | [ddpm-unet简单去噪](./ddpm-unet-mnist/ddpm-unet-mnist.md) | +| 文本分类 | [基金年报问答意图识别](./financial_report/financial_report.md) | diff --git a/docs/chapter6/ddpm-unet-mnist/ddpm-unet-mnist.md b/docs/chapter6/ddpm-unet-mnist/ddpm-unet-mnist.md index 547808b..02502e3 100644 --- a/docs/chapter6/ddpm-unet-mnist/ddpm-unet-mnist.md +++ b/docs/chapter6/ddpm-unet-mnist/ddpm-unet-mnist.md @@ -25,7 +25,7 @@ import pandas as pd from matplotlib import pyplot as plt ``` -### 数据集的加载 +### 加载数据集 ```python class MnistDataset: diff --git a/docs/chapter6/financial_report/financial_report.md b/docs/chapter6/financial_report/financial_report.md new file mode 100644 index 0000000..22c5afa --- /dev/null +++ b/docs/chapter6/financial_report/financial_report.md @@ -0,0 +1,315 @@ +--- +comments: true 
+title: 基金年报问答意图识别 +--- + +![qwen2](./imgs/qwen2.png) + +## 前言 + +!!! quote "引用" + 该篇代码摘抄自蚂蚁团队的`DBGPT`项目中的金融商业分析案例。原始开源地址可以查看[:material-github:DB-GPT-Hub](https://github.com/eosphoros-ai/DB-GPT-Hub/blob/main/src/dbgpt-hub-nlu/dbgpt_hub_nlu/intent.py)。 + +大模型在财务报表分析中的应用正成为垂直领域的热门方向:fire:。 + +尽管大模型能够高效理解复杂财务规则并生成合理的分析结果,但由于财务报表信息庞大且复杂,对数据分析的精准度要求很高,传统的通用`RAG`和`Agent`的解决方案往往难以满足需求。 + +例如,在处理具体查询(如季度营业净利润)时,传统方法通过向量检索召回相关文本,但财报中多处信息可能引发误判。 + +此外,涉及财务指标计算时(如毛利率、净利率),需要综合多方面数据进行分析,这进一步增加了复杂性。 + +为解决这些问题,可以结合财务领域的专业知识,添加**专门的外部模块**进行功能增强。 + +本文所做的文本分类任务旨在解决大模型在意图识别任务中存在的模糊性问题,提升其对具体意图的精准识别能力。 + +!!! note "小结" + 大模型在大多数领域确实是能够取得不错的效果,但是在特定领域,仍然需要结合传统的方法,进行功能增强,比如: + + - AI+导航:如果单纯使用大模型进行地名的提取,在粗粒度的情况下,识别国家、省份或城市等地名,大模型能够取得极好的效果,但是在细粒度的情况下,如从这句话`MSC HAMBURG号将于11月25日停靠上海洋山港三期码头D1泊位,卸货至6号仓库。`提取`POI`点,大模型往往错误提取到`海洋山港三期码头D1泊位`,或者`6号仓库`,但是仅仅将这些传递给导航系统,往往无法找到正确的位置。 + - AI+问答:大语言模型是擅长做问答的,但是从实际业务场景来看,用户的输入往往是非标准的,带有大量的噪声,甚至是缺失、错误的信息,因此往往需要结合**查询改写**、**重排序**等功能,将问题标准化。 + - $\cdots$ + +## 代码 + +### 导入函数库 + +```python +import numpy as np +import pandas as pd + +from datasets import Dataset +from sklearn.metrics import precision_score, recall_score, f1_score +import torch +from peft import PeftConfig, PeftModel, LoraConfig, TaskType, get_peft_model +from transformers import ( + Qwen2ForSequenceClassification, + AutoTokenizer, + DataCollatorWithPadding, + Trainer, + TrainingArguments, +) +``` + +### 读取数据集 + +```python +df = pd.read_json( + "./data/financial_report.jsonl", + lines=True, + orient="records", +) +``` + +部分样本数据如下: + +| question | intent | +| :----------------------------------------------------------------------------------------------------- | :--------------- | +| 能否根据2020年金宇生物技术股份有限公司的年报,给我简要介绍一下报告期内公司的社会责任工作情况? | 报告解读分析 | +| 请根据江化微2019年的年报,简要介绍报告期内公司主要销售客户的客户集中度情况,并结合同行业情况进行分析。 | 报告解读分析 | +| 2019年四方科技电子信箱是什么? | 年报基础信息问答 | +| 研发费用对公司的技术创新和竞争优势有何影响? | 专业名称解释 | +| 康希诺生物股份公司在2020年的资产负债比率具体是多少,需要保留至小数点后两位? 
| 财务指标计算 | +| $\dots$ | $\dots$ | + +```python title="label2id" +{ + "报告解读分析": 0, + "年报基础信息问答": 1, + "专业名称解释": 2, + "财务指标计算": 3, + "统计对比分析": 4, + "其他问题": 5, +} +``` + +```python title="id2label" +{ + 0: "报告解读分析", + 1: "年报基础信息问答", + 2: "专业名称解释", + 3: "财务指标计算", + 4: "统计对比分析", + 5: "其他问题", +} +``` + +```python +df["labels"] = df["intent"].map(labels2id) +``` + +利用 `pandas` 的映射函数,将 `intent` 列文本数据转化为对应的数字标签,并设置列名为 `labels` 。转换后的样本数据案例为: + +| question | intent | labels | +| :----------------------------------------------------------------------------------------------------- | :--------------- | ------: | +| 能否根据2020年金宇生物技术股份有限公司的年报,给我简要介绍一下报告期内公司的社会责任工作情况? | 报告解读分析 | 0 | +| 请根据江化微2019年的年报,简要介绍报告期内公司主要销售客户的客户集中度情况,并结合同行业情况进行分析。 | 报告解读分析 | 0 | +| 2019年四方科技电子信箱是什么? | 年报基础信息问答 | 1 | +| 研发费用对公司的技术创新和竞争优势有何影响? | 专业名称解释 | 2 | +| 康希诺生物股份公司在2020年的资产负债比率具体是多少,需要保留至小数点后两位? | 财务指标计算 | 3 | +| $\dots$ | $\dots$ | $\dots$ | + +将数据集转化为`datasets.Dataset`对象: + +```python +ds = Dataset.from_pandas(df) + +ds = ds.train_test_split(test_size=0.2, shuffle=True, seed=2024) +``` + +### 加载模型 + +```python +model = Qwen2ForSequenceClassification.from_pretrained( + "Qwen/Qwen2-1.5B-Instruct", + num_labels=len(id2labels), + id2label=id2labels, + label2id=labels2id, +) + +peft_config = LoraConfig( + task_type=TaskType.SEQ_CLS, + inference_mode=False, + r=8, + lora_alpha=32, + lora_dropout=0.1, + target_modules=[ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "up_proj", + "gate_proj", + "down_proj", + ], +) + +model = get_peft_model(model, peft_config) +``` + +使用参数高效微调后打印可训练参数如下: + +```python title="model.print_trainable_parameters()" +trainable params: 4,404,480 || all params: 498,442,624 || trainable%: 0.8836 +``` + +### 数据预处理 + +```python +def tokenize_and_align_labels(examples, tokenizer, max_length): + tokenized_inputs = tokenizer( + examples["question"], + padding=True, + max_length=max_length, + truncation=True, + ) + return tokenized_inputs +``` + +该函数用于数据编码。 + +### 加载分词器 + +```python 
+tokenizer = AutoTokenizer.from_pretrained( + "Qwen/Qwen2-1.5B-Instruct", + pad_token="<|endoftext|>", + trust_remote_code=True, +) + +tokenizer.add_special_tokens({"pad_token": "<|endoftext|>"}) +model.config.pad_token_id = tokenizer.pad_token_id +``` + +许多语言模型,尤其是生成模型,通常使用特殊的标记(例如 `<|endoftext|>`)表示文本的结束,但未必专门定义了 `pad_token`。 + +### 定义评价指标 + +```python +def compute_metrics(p): + preds = np.argmax(p.predictions, axis=1) + labels = p.label_ids + accuracy = (preds == labels).mean() + precision = precision_score(labels, preds, average="weighted") + recall = recall_score(labels, preds, average="weighted") + f1 = f1_score(labels, preds, average="weighted") + return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1} +``` + +### 数据集处理 + +```python +tokenized_ds = ds.map(lambda x: tokenize_and_align_labels(x, tokenizer, None), batched=True) +``` + +- 利用分词器对文本进行批量编码。 + +### 定义动态数据整理 + +```python +data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True) +``` + +### 定义训练参数 + +```python +training_args = TrainingArguments( + output_dir="./output", + num_train_epochs=2, + per_device_train_batch_size=32, + learning_rate=1e-4, + weight_decay=0.01, + do_train=True, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, +) +``` + +### 定义训练器 + +```python +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_ds["train"], + eval_dataset=tokenized_ds["test"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, +) +``` + +### 训练 + +```python +trainer.train() +``` + +下面是训练时的过程结果。 + +| 轮次 | 评估损失 | 准确率 | 精确率 | 召回率 | F1值 | +| ---- | -------- | ------ | ------ | ------ | ------ | +| 1 | 0.0151 | 0.9985 | 0.9985 | 0.9985 | 0.9985 | +| 2 | 0.0156 | 0.9985 | 0.9985 | 0.9985 | 0.9985 | + +### 推理 + +```python +adapter_path = './output' +peft_config = PeftConfig.from_pretrained(adapter_path) + +model = Qwen2ForSequenceClassification.from_pretrained( + 
peft_config.base_model_name_or_path, + num_labels=len(labels2id) +) + +tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path) +model = PeftModel.from_pretrained(model, adapter_path) +# merge and unload is necessary for inference +model = model.merge_and_unload() + +model.config.pad_token_id = tokenizer.pad_token_id +device = 'cuda' if torch.cuda.is_available() else 'cpu' +model = model.to(device) + +def infer(question): + inputs = tokenizer( + question, + padding="longest", + max_length=512, + truncation=True, + return_tensors="pt", + ).to(device) + with torch.no_grad(): + outputs = model(**inputs) + predictions = torch.argmax(outputs.logits, dim=-1) + return predictions.cpu().numpy()[0] + +question = 'xx股份公司在2024年的资产负债比率具体是多少' +prediction = infer(question) +intent_label = {v: k for k, v in labels2id.items()}[prediction] +``` + +| 问题 | 预测 | +| ------------------------------------------ | ---------------- | +| xx股份公司在2024年的资产负债比率具体是多少 | 财务指标计算 | +| 2019年四方科技电子信箱是什么 | 年报基础信息问答 | +| $\cdots$ | $\cdots$ | + +## 参考资料 + +
+ +- 开源的AI原生数据应用开发框架 + + --- + + [:material-github:DB-GPT](https://github.com/eosphoros-ai/DB-GPT) + +- 商业数据分析案例 + + --- + + [:bird:基于DB-GPT的财报分析助手](https://www.yuque.com/eosphoros/dbgpt-docs/cmogrzbtmqf057oe) + +
diff --git a/docs/chapter6/financial_report/financial_report.py b/docs/chapter6/financial_report/financial_report.py new file mode 100644 index 0000000..6d11313 --- /dev/null +++ b/docs/chapter6/financial_report/financial_report.py @@ -0,0 +1,132 @@ +import pandas as pd +from datasets import Dataset +import numpy as np +from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score + +from transformers import ( + Qwen2ForSequenceClassification, + AutoTokenizer, + DataCollatorWithPadding, + Trainer, + TrainingArguments, +) +from peft import LoraConfig, TaskType, get_peft_model + + +# def get_device(): +# return torch.device("cuda:7" if torch.cuda.is_available() else "cpu") +# +# device = get_device() + +df = pd.read_json( + "./data/financial_report.jsonl", + lines=True, + orient="records", +) + +id2labels = dict(zip(pd.factorize(df["intent"])[0].tolist(), df["intent"])) +labels2id = {v: k for k, v in id2labels.items()} + +df["labels"] = df["intent"].map(labels2id) + +ds = Dataset.from_pandas(df) + +ds = ds.train_test_split(test_size=0.2, shuffle=True, seed=2024) + +model = Qwen2ForSequenceClassification.from_pretrained( + "/data/czq/tjx/unlock-hf/model", + num_labels=len(id2labels), + id2label=id2labels, + label2id=labels2id, +) + +peft_config = LoraConfig( + task_type=TaskType.SEQ_CLS, + inference_mode=False, + r=8, + lora_alpha=32, + lora_dropout=0.1, + target_modules=[ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "up_proj", + "gate_proj", + "down_proj", + ], +) + +model = get_peft_model(model, peft_config) + +model.print_trainable_parameters() + + +def tokenize_and_align_labels(examples, tokenizer, max_length): + tokenized_inputs = tokenizer( + examples["question"], + padding=True, + max_length=max_length, + truncation=True, + ) + return tokenized_inputs + + +tokenizer = AutoTokenizer.from_pretrained( + "/data/czq/tjx/unlock-hf/model", + pad_token="<|endoftext|>", + trust_remote_code=True, +) + +tokenizer.add_special_tokens({"pad_token": 
"<|endoftext|>"}) + +intent_dict = id2labels + + +def compute_metrics(p): + preds = np.argmax(p.predictions, axis=1) + labels = p.label_ids + accuracy = (preds == labels).mean() + precision = precision_score(labels, preds, average="weighted") + recall = recall_score(labels, preds, average="weighted") + f1 = f1_score(labels, preds, average="weighted") + return { + "accuracy": accuracy, + "precision": precision, + "recall": recall, + "f1": f1, + } + + +model.config.pad_token_id = tokenizer.pad_token_id + +tokenized_ds = ds.map( + lambda x: tokenize_and_align_labels(x, tokenizer, None), + batched=True, +) + +data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True) + +training_args = TrainingArguments( + output_dir="./output", + num_train_epochs=2, + per_device_train_batch_size=32, + learning_rate=1e-4, + weight_decay=0.01, + do_train=True, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, +) + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_ds["train"], + eval_dataset=tokenized_ds["test"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, +) + +trainer.train() diff --git a/docs/chapter6/financial_report/imgs/qwen2.png b/docs/chapter6/financial_report/imgs/qwen2.png new file mode 100644 index 0000000..6a06fb7 Binary files /dev/null and b/docs/chapter6/financial_report/imgs/qwen2.png differ diff --git a/docs/chapter8/bert/modeling/modeling.md b/docs/chapter8/bert/modeling/modeling.md index 9f890ec..632d5ca 100644 --- a/docs/chapter8/bert/modeling/modeling.md +++ b/docs/chapter8/bert/modeling/modeling.md @@ -148,3 +148,180 @@ class BertEmbeddings(nn.Module): 4. 使用段落嵌入层获取段落向量。 5. 将词向量、段落向量和位置编码求和。 6. 
将求和结果进行层归一化$\rightarrow$随机丢弃$\rightarrow$返回结果。 + +### `BertSelfAttention` + +#### 初始化 + +```python +class BertSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(): + ... +``` + +该段代码: + +1. 首先检查隐藏层大小能否被自注意力头数整除:如果不能整除,并且配置中没有`embedding_size`属性,则抛出异常。 +2. 初始化自注意力头个数`num_attention_heads`。 +3. 初始化每个自注意力头的维度`attention_head_size`。 +4. 初始化所有自注意力头的总维度`all_head_size`,即头数与每个头维度的乘积。 +5. 初始化`q`、`k`、`v`的线性层。维度为`hidden_size`到`all_head_size`。 +6. 初始化随机丢弃层。丢弃概率为`attention_probs_dropout_prob`。 +7. 获取位置编码类型,默认为绝对位置编码。 +8. 如果位置编码类型为相对位置编码(`relative_key`或`relative_key_query`),则记录`max_position_embeddings`并初始化距离嵌入层`distance_embedding`。 +9. 
记录当前模块是否作为解码器使用(`is_decoder`)。 + +#### transpose_for_scores + +```python +class BertSelfAttention(nn.Module): + def __init__(): + ... 
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) +``` + +该段代码: + +1. 在实际运算中,`x`代表经过`q`,`k`,`v`之一处理过后的张量,其维度为`(batch_size, seq_length, hidden_size)`。 +2. 通过索引`x`的前两个维度,再结合自注意力头数和每个自注意力头的维度,将`new_x_shape`设置为`(batch_size, seq_length, num_attention_heads, attention_head_size)`。 +3. 将`x`按照`new_x_shape`的维度进行数据变化,变化后的维度为`(batch_size, seq_length, num_attention_heads, attention_head_size)`。 +4. 通过`permute`函数,将`x`的维度变化为`(batch_size, num_attention_heads, seq_length, attention_head_size)`。 + +#### forward + +```python +class BertSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + ... + def transpose_for_scores(): + ... + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs +``` diff --git a/mkdocs.yml b/mkdocs.yml index 77f55d0..05e3e98 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -8,8 +8,11 @@ extra_javascript: - https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js markdown_extensions: - - admonition - attr_list + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + - admonition - md_in_html - pymdownx.arithmatex: generic: true @@ -74,6 +77,7 @@ nav: - 目标检测: "chapter6/container-detr/container-detr.md" - 文本翻译: "chapter6/translation/translation.md" - 一种简单的去噪方法: 'chapter6/ddpm-unet-mnist/ddpm-unet-mnist.md' + - 文本分类: "chapter6/financial_report/financial_report.md" - Gradio工具: - 索引: 'chapter7/gradio_index.md' - Gradio: 'chapter7/gradio/gradio_tour.md'