diff --git a/.github/scripts/spellcheck_conf/wordlist.txt b/.github/scripts/spellcheck_conf/wordlist.txt
index 8e1440bd9..d3de28c3e 100644
--- a/.github/scripts/spellcheck_conf/wordlist.txt
+++ b/.github/scripts/spellcheck_conf/wordlist.txt
@@ -1400,6 +1400,19 @@ sqlite
 customerservice
 fn
 ExecuTorch
+LLMScore
+RecursiveCharacterTextSplitter
+TPD
+TPM
+Tianjun
+Zhang
+distractor
+distractors
+frac
+numRefusal
+totalQA
+DirectoryLoader
+SitemapLoader
 nf
 quant
 DLAI
diff --git a/recipes/quickstart/finetuning/datasets/raft_dataset.py b/recipes/quickstart/finetuning/datasets/raft_dataset.py
new file mode 100644
index 000000000..9341dd317
--- /dev/null
+++ b/recipes/quickstart/finetuning/datasets/raft_dataset.py
@@ -0,0 +1,97 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
+
+
+import copy
+from datasets import load_dataset
+import itertools
+
+# Check whether the system prompt or user prompt header token sequence appears in the current token list
+def check_header(targets, seq):
+    for i in range(len(seq)-3):
+        if seq[i:i+3] in targets:
+            return True
+    return False
+def replace_target(target, seq):
+    for i in range(len(seq)-3):
+        if seq[i:i+3] == target:
+            seq[i], seq[i+1], seq[i+2] = -100, -100, -100
+    return seq
+def tokenize_dialog(dialog, tokenizer):
+    # If vocab size is at least 128000, the tokenizer comes from a Llama 3 family model, so use its chat template to generate the tokens
+    if tokenizer.vocab_size >= 128000:
+        dialog_tokens = tokenizer.apply_chat_template(dialog)
+        eot_indices = [i for i, n in enumerate(dialog_tokens) if n == 128009]
+        labels = copy.copy(dialog_tokens)
+        last_idx = 0
+        # system prompt header "<|start_header_id|>system<|end_header_id|>" has been tokenized to [128006, 9125, 128007]
+        # user prompt header "<|start_header_id|>user<|end_header_id|>" has been tokenized to [128006, 882, 128007]
+        prompt_header_seqs = [[128006, 9125, 128007], [128006, 882, 128007]]
+        for n, idx in enumerate(eot_indices):
+            current_seq = labels[last_idx:idx+1]
+            if check_header(prompt_header_seqs, current_seq):
+                # found a prompt header, indicating that this seq should be masked
+                labels[last_idx:idx+1] = [-100] * (idx-last_idx+1)
+            else:
+                last_idx = idx
+        # Lastly, mask the assistant header prompt <|start_header_id|>assistant<|end_header_id|>, which has been tokenized to [128006, 78191, 128007]
+        assistant_header_seq = [128006, 78191, 128007]
+        labels = replace_target(assistant_header_seq, labels)
+        dialog_tokens = [dialog_tokens]
+        labels_tokens = [labels]
+    else:
+        raise Exception("This raft_dataset only supports Llama 3 family models, please make sure the tokenizer is from Llama 3 family models.")
+
+    combined_tokens = {
+        "input_ids": list(itertools.chain(*(t for t in dialog_tokens))),
+        "labels": list(itertools.chain(*(t for t in labels_tokens))),
+    }
+
+    return dict(combined_tokens, attention_mask=[1]*len(combined_tokens["input_ids"]))
+def raft_tokenize(q_a_pair, tokenizer):
+    end_tag = "</DOCUMENT>"
+    # find the last end_tag in the instruction; the rest is the question
+    try:
+        index = q_a_pair["instruction"].rindex(end_tag)+len(end_tag)
+    except ValueError:
+        print(q_a_pair["instruction"])
+        raise Exception("The instruction does not contain the end tag </DOCUMENT>")
+    # all the lines after end_tag are the question
+    question = q_a_pair["instruction"][index:].strip()
+    # all the lines before end_tag are the context documents
+    documents = q_a_pair["instruction"][:index].strip()
+    # output is the label
+    answer = 
q_a_pair["output"]
+    system_prompt = "You are a helpful chatbot who can provide an answer to every questions from the user given a relevant context."
+    user_prompt = """
+        Question: {question}\nContext: {context}\n
+        Answer this question using the information given by multiple documents in the context above. Here are the things to pay attention to:
+        - The context contains many documents, each document starts with <DOCUMENT> and ends </DOCUMENT>.
+        - First provide step-by-step reasoning on how to answer the question.
+        - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.
+        - End your response with final answer in the form <ANSWER>: $answer, the answer should less than 60 words.
+        You MUST begin your final answer with the tag "<ANSWER>:".
+    """.format(question=question, context=documents)
+
+    chat = [
+      {"role": "system", "content": system_prompt},
+      {"role": "user", "content": user_prompt},
+      {"role": "assistant", "content": answer}
+    ]
+    return tokenize_dialog(chat, tokenizer)
+
+
+def get_custom_dataset(dataset_config, tokenizer, split, split_ratio=0.9):
+    # load_dataset will return DatasetDict that contains all the data in the train set
+    dataset_dict = load_dataset('json', data_files=dataset_config.data_path)
+    dataset = dataset_dict['train']
+    dataset = dataset.train_test_split(test_size=1-split_ratio, shuffle=True, seed=42)
+
+    dataset = dataset[split].map(lambda sample: {
+        "instruction": sample["instruction"],
+        "output": sample["cot_answer"],
+        },
+        batched=True,
+    )
+    dataset = dataset.map(lambda x: raft_tokenize(x, tokenizer))
+    return dataset
diff --git a/recipes/responsible_ai/prompt_guard/prompt_guard_tutorial.ipynb b/recipes/responsible_ai/prompt_guard/prompt_guard_tutorial.ipynb
index fa0013dd1..dc070a295 100644
--- a/recipes/responsible_ai/prompt_guard/prompt_guard_tutorial.ipynb
+++ b/recipes/responsible_ai/prompt_guard/prompt_guard_tutorial.ipynb
@@ -789,7 +789,7 @@
    "metadata": {},
    "source": [
     "\n",
-    "One good way to quickly obtain labeled training data for a use case is to use the original, non-fine tuned model itself to highlight risky examples to label, while drawing random negatives from below a score threshold. This helps address the class imbalance (attacks and risky prompts can be a very small percentage of all prompts) and includes false positive examples (which tend to be very valuable to train on) in the dataset. The use of synthetic data for specific "
+    "One good way to quickly obtain labeled training data for a use case is to use the original, non-fine tuned model itself to highlight risky examples to label, while drawing random negatives from below a score threshold. This helps address the class imbalance (attacks and risky prompts can be a very small percentage of all prompts) and includes false positive examples (which tend to be very valuable to train on) in the dataset. Generating synthetic fine-tuning data for specific use cases can also be an effective strategy."
] } ],
diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/README.md b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/README.md
new file mode 100644
index 000000000..50356d509
--- /dev/null
+++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/README.md
@@ -0,0 +1,243 @@
+
+## Chatbot Recipe:
+As the popularity of our Meta Llama 3 models grows, we've seen a surge in demand to adapt them to specific domains, enabling businesses to better serve their customers. For example, a company might have a vast collection of plain text documents related to their custom domain and want to create a chatbot that can answer client questions.
+
+In response to this demand, we're exploring the possibility of building a Llama chatbot that can answer Llama-related questions using our Meta Llama 3 models. In this tutorial, we'll demonstrate how to do just that. While our Meta Llama 3 70B Instruct model is an excellent candidate, its production costs are relatively high. To reduce these costs, we'll focus on creating a Llama chatbot based on the Meta Llama 3 8B Instruct model, aiming to achieve similar accuracy while minimizing inference costs.
+
+One common approach to adapting a model to new domain data is **fine-tuning**. The idea is to start from a pre-trained model that already has some knowledge of language from its pre-training and adapt it to a new domain. However, a [recent paper](https://arxiv.org/pdf/2405.05904) highlights the risk of using supervised fine-tuning to update LLMs' knowledge: it presents empirical evidence that acquiring new knowledge through fine-tuning is correlated with hallucinations with respect to preexisting knowledge. Fine-tuning can also be costly if the domain knowledge has to be updated frequently.
+
+Another solution is **RAG (Retrieval-Augmented Generation)**, which combines the strengths of traditional information retrieval systems (such as databases) with the capabilities of generative large language models (LLMs). RAG first retrieves relevant information from a database using a query generated by the LLM, then integrates that information into the LLM's input, enabling it to generate more accurate and contextually relevant text. This helps reduce LLM hallucination, since the relevant documents are provided to the LLM, and it lowers the cost of keeping the domain knowledge up to date.
+
+In this tutorial, we'll use **Retrieval Augmented Fine Tuning (RAFT)**, a technique that combines fine-tuning with RAG to better utilize custom domain text data. RAFT is a general recipe for fine-tuning a pre-trained Large Language Model (LLM) for a domain-specific RAG setting: the model learns to use the documents that help answer the question and to ignore those that don't. This approach can produce a more factual model and reduce LLM hallucinations during inference.
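+
+To make this concrete, each RAFT training sample pairs a question with a mix of helpful and irrelevant documents plus a chain-of-thought answer. The sketch below is purely illustrative (the field names are simplified; the exact schema produced by `raft.py` is shown later in this tutorial):
+
+```python
+# Purely illustrative sketch of one RAFT training sample (not the exact schema produced by raft.py).
+raft_sample = {
+    "question": "What is the context length supported by Llama 3 models?",
+    "oracle_document": "... the text chunk that actually contains the answer ...",
+    "distractor_documents": [
+        "... unrelated chunk 1 ...",
+        "... unrelated chunk 2 ...",
+    ],
+    # Chain-of-thought answer generated from the oracle document, quoting its
+    # supporting sentence and ending with a short final answer.
+    "cot_answer": "The relevant sentence is ##begin_quote## ... ##end_quote## ... <ANSWER>: 8K",
+}
+```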
+
+The process involves preparing training data with each data point containing:
+
+* A question (Q)
+* A set of documents (D)
+* A corresponding Chain-of-thought style answer (A*) generated from one of the documents (D*)
+
+RAFT teaches the model to differentiate between two types of documents:
+
+* Oracle documents (D*): documents from which the answer to the question can be deduced
+* Distractor documents (Di): documents that do not contain answer-relevant information
+
+The following diagram illustrates the main RAFT concepts:
+![RAFT images](images/RAFT.png)
+
+For more information on RAFT, please refer to their [blog post](https://gorilla.cs.berkeley.edu/blogs/9_raft.html).
+
+## Fine-tuning Llama
+
+To build a Llama bot, we need to collect relevant text data. Ideally, we would include a vast range of Llama-related web documents, but for demo purposes, we'll focus on official documents. For example, we can use the raw text from official web pages listed in [Getting started with Meta Llama](https://llama.meta.com/get-started/), excluding the FAQ page since some evaluation questions will come from there.
+
+We have two options to obtain the text data: using a local folder or web crawling. For the local folder option, we can download the desired documents in PDF, Text, or Markdown format to the "data" folder specified in the [raft.yaml](./raft.yaml) file. The Langchain DirectoryLoader will load the files in that folder, but it may ask us to install additional package dependencies if some file formats are not supported natively.
+
+Alternatively, we can create a sitemap XML file, similar to the example below, and put the file path in the [raft.yaml](./raft.yaml) file, so that a Langchain SitemapLoader can retrieve all the text from the web pages.
+
+```xml
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url>
+    <loc>http://llama.meta.com/responsible-use-guide/</loc>
+  </url>
+</urlset>
+```
+
+## Create RAFT Dataset
+
+To create a RAFT dataset from the prepared documents, we can use the Meta Llama 3 70B Instruct model either through APIs from LLM cloud providers or by hosting a local VLLM server.
+
+For this example, we'll demonstrate how to create a VLLM OpenAI-compatible server that hosts Meta Llama 3 70B Instruct locally and generates the RAFT dataset.
+
+**Local Server Setup**
+
+First, ensure VLLM is installed. Then, run the following command to start the VLLM server:
+```bash
+CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server --model meta-Llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 2 --disable-log-requests --port 8001
+```
+**Note**: Make sure the port is available. The server requires at least 135GB of GPU memory, so the model must be sharded across multiple GPUs with tensor parallelism.
+
+**Querying the Server**
+
+Once the server is ready, query it using the following command in another terminal:
+```bash
+python raft.py -u "http://localhost:8001/v1" -k "EMPTY" -t 4
+```
+If you prefer to use a cloud API, replace the endpoint URL with the cloud provider's URL and set the API key using the `-k` flag or environment variables.
+
+**RAFT Dataset Generation**
+
+The [raft.py](raft.py) script reads all documents from local or web sources, depending on the settings, and splits the data into text chunks of 1000 characters using RecursiveCharacterTextSplitter.
+
+Then, it applies the `question_prompt_template` defined in [raft.yaml](raft.yaml) to each chunk, prompting the Meta Llama 3 70B Instruct model to generate a list of questions (by default, 4 questions) for each text chunk.
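+
+A condensed sketch of this chunking and question-generation step is shown below. The actual implementation lives in `raft_utils.py` and uses the prompt templates from `raft.yaml`; the document path, splitter settings, and prompt wording here are simplified assumptions for illustration only.
+
+```python
+# Simplified sketch of the chunking + question-generation step (not the exact raft_utils.py code).
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from openai import OpenAI
+
+# Assumption: the raw text gathered by the loaders has been concatenated into this file.
+all_document_text = open("./data/llama_docs.txt").read()
+
+# Split the collected text into roughly 1000-character chunks.
+splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+chunks = splitter.split_text(all_document_text)
+
+# Point an OpenAI-compatible client at the local VLLM server started above.
+client = OpenAI(base_url="http://localhost:8001/v1", api_key="EMPTY")
+
+questions_per_chunk = 4
+chunk_questions = []
+for chunk in chunks:
+    response = client.chat.completions.create(
+        model="meta-Llama/Meta-Llama-3-70B-Instruct",
+        messages=[
+            {"role": "system", "content": f"Generate {questions_per_chunk} questions that can be answered using only the given context. Return one question per line."},
+            {"role": "user", "content": f"Context: {chunk}"},
+        ],
+    )
+    # One question per line in the model's reply.
+    questions = [q.strip() for q in response.choices[0].message.content.split("\n") if q.strip()]
+    chunk_questions.append((chunk, questions))
+```
+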
For each question and corresponding text chunk, we generate a Chain-of-Thought (COT) style answer using Meta Llama 3 70B Instruct APIs. + +Once we have the COT answers, we can create a dataset where each sample contains an "instruction" section. This section includes some unrelated chunks called distractors (by default, we add 4 distractors). In the original RAFT method, there is an oracle probability P (by default, 80%) that a related document will be included. This means that there is a 1-P (by default, 20%) chance that no related documents are provided, and the RAFT model should still try to predict the COT answer label, as stated in the blog, "By removing the oracle documents in some instances of the training data, we are compelling the model to memorize domain-knowledge." + +**Modification to Add Refusal Examples** + +In this tutorial, we made an important modification by adding additional refusal examples (by default, this refusal probability is 5%). When the related documents are not presented, we set the COT answer label to "Sorry, I don't know the answer to this question because related documents are not found. Please try again." Our hypothesis is that this will increase answer precision and reduce chatbot hallucination. In real-world production scenarios, we prefer that the chatbot refuses to answer when not enough context is provided, so that we can detect this refusal signal and mitigate the risk of producing wrong or misleading answers (e.g., we can ask a human agent to take over the conversation to better serve customers). + +**RAFT Format JSON Example** + +Here is a RAFT format JSON example from our saved `raft.jsonl` file: +```json +{ + "id":"seed_task_228", + "type":"general", + "question":"What is the context length supported by Llama 3 models?", + "context":{ + "sentences":[ + [ + "DISTRACT_DOCS 1" + "DISTRACT_DOCS 2" + "We hope that Code Llama will inspire others to leverage Llama 2 to create new innovative tools for research and commercial products. Download the model Explore more on Code Llama Discover more about Code Llama here \u2014 visit our resources, ranging from our research paper, getting started guide and more. Code Llama GitHub repository Research paper Download the model Getting started guide Meta Llama 3 Build the future of AI with Meta Llama 3 Now available with both 8B and 70B pretrained and instruction-tuned versions to support a wide range of applications Build the future of AI with Meta Llama 3 Now available with both 8B and 70B pretrained and instruction-tuned versions to support a wide range of applications Get Started Experience Llama 3 on Meta AI Experience Llama 3 with Meta AI We\u2019ve integrated Llama 3 into Meta AI, our intelligent assistant, that expands the ways people can get things done, create and connect with Meta AI. You can see first-hand the performance of Llama 3 by using Meta AI for coding tasks and problem solving. Whether you're developing agents, or other AI-powered applications, Llama 3 in both 8B and 70B will offer the capabilities and flexibility you need to develop your ideas. Experience Llama 3 on Meta AI Enhanced performance Experience the state-of-the-art performance of Llama 3, an openly accessible model that excels at language nuances, contextual understanding, and complex tasks like translation and dialogue generation. 
With enhanced scalability and performance, Llama 3 can handle multi-step tasks effortlessly, while our refined post-training processes significantly lower false refusal rates, improve response alignment, and boost diversity in model answers. Additionally, it drastically elevates capabilities like reasoning, code generation, and instruction following. Build the future of AI with Llama 3. Download Llama 3 Getting Started Guide With each Meta Llama request, you will receive: Meta Llama Guard 2 Getting started guide Responsible Use Guide Acceptable use policy Model card Community license agreement Benchmarks Llama 3 models take data and scale to new heights. It\u2019s been trained on our two recently announced custom-built 24K GPU clusters on over 15T token of data \u2013 a training dataset 7x larger than that used for Llama 2, including 4x more code. This results in the most capable Llama model yet, which supports a 8K context length that doubles the capacity of Llama 2. Model card Trust & safety A comprehensive approach to responsibility With the release of Llama 3, we\u2019ve updated the Responsible Use Guide (RUG) to provide the most comprehensive information on responsible development with LLMs. Our system-centric approach includes updates to our trust and safety tools with Llama Guard 2, optimized to support the newly announced taxonomy published by MLCommons expanding its coverage to a more comprehensive set of safety categories, Code Shield, and Cybersec Eval 2. In line with the principles outlined in our RUG , we recommend thorough checking and filtering of all inputs to and outputs from LLMs based on your unique content guidelines for your intended use case and audience. Meta Llama Guard 2 Explore more on Meta Llama 3 Introducing Meta Llama 3: The most capable openly available LLM to date Read the blog Meet Your New Assistant: Meta AI, Built With Llama 3 Learn more Meta Llama 3 repository View repository Model card Explore Meta Llama 3 License META LLAMA 3 COMMUNITY LICENSE AGREEMENT Meta Llama 3 Version Release Date: April 18, 2024 \u201c Agreement \u201d means the terms and conditions for use, reproduction, distribution and modification of the Llama Materials set forth herein. \u201c Documentation \u201d means the specifications, manuals and documentation accompanying Meta Llama 3 distributed by Meta at https:\/\/llama.meta.com\/get-started\/ .", + "DISTRACT_DOCS 3" + "DISTRACT_DOCS 4" + ] + ], + "title":[ + [ + "placeholder_title", + "placeholder_title", + "placeholder_title", + "placeholder_title", + "placeholder_title", + ] + ] + }, + "oracle_context":"We hope that Code Llama will inspire others to leverage Llama 2 to create new innovative tools for research and commercial products. Download the model Explore more on Code Llama Discover more about Code Llama here \u2014 visit our resources, ranging from our research paper, getting started guide and more. 
Code Llama GitHub repository Research paper Download the model Getting started guide Meta Llama 3 Build the future of AI with Meta Llama 3 Now available with both 8B and 70B pretrained and instruction-tuned versions to support a wide range of applications Build the future of AI with Meta Llama 3 Now available with both 8B and 70B pretrained and instruction-tuned versions to support a wide range of applications Get Started Experience Llama 3 on Meta AI Experience Llama 3 with Meta AI We\u2019ve integrated Llama 3 into Meta AI, our intelligent assistant, that expands the ways people can get things done, create and connect with Meta AI. You can see first-hand the performance of Llama 3 by using Meta AI for coding tasks and problem solving. Whether you're developing agents, or other AI-powered applications, Llama 3 in both 8B and 70B will offer the capabilities and flexibility you need to develop your ideas. Experience Llama 3 on Meta AI Enhanced performance Experience the state-of-the-art performance of Llama 3, an openly accessible model that excels at language nuances, contextual understanding, and complex tasks like translation and dialogue generation. With enhanced scalability and performance, Llama 3 can handle multi-step tasks effortlessly, while our refined post-training processes significantly lower false refusal rates, improve response alignment, and boost diversity in model answers. Additionally, it drastically elevates capabilities like reasoning, code generation, and instruction following. Build the future of AI with Llama 3. Download Llama 3 Getting Started Guide With each Meta Llama request, you will receive: Meta Llama Guard 2 Getting started guide Responsible Use Guide Acceptable use policy Model card Community license agreement Benchmarks Llama 3 models take data and scale to new heights. It\u2019s been trained on our two recently announced custom-built 24K GPU clusters on over 15T token of data \u2013 a training dataset 7x larger than that used for Llama 2, including 4x more code. This results in the most capable Llama model yet, which supports a 8K context length that doubles the capacity of Llama 2. Model card Trust & safety A comprehensive approach to responsibility With the release of Llama 3, we\u2019ve updated the Responsible Use Guide (RUG) to provide the most comprehensive information on responsible development with LLMs. Our system-centric approach includes updates to our trust and safety tools with Llama Guard 2, optimized to support the newly announced taxonomy published by MLCommons expanding its coverage to a more comprehensive set of safety categories, Code Shield, and Cybersec Eval 2. In line with the principles outlined in our RUG , we recommend thorough checking and filtering of all inputs to and outputs from LLMs based on your unique content guidelines for your intended use case and audience. Meta Llama Guard 2 Explore more on Meta Llama 3 Introducing Meta Llama 3: The most capable openly available LLM to date Read the blog Meet Your New Assistant: Meta AI, Built With Llama 3 Learn more Meta Llama 3 repository View repository Model card Explore Meta Llama 3 License META LLAMA 3 COMMUNITY LICENSE AGREEMENT Meta Llama 3 Version Release Date: April 18, 2024 \u201c Agreement \u201d means the terms and conditions for use, reproduction, distribution and modification of the Llama Materials set forth herein. 
\u201c Documentation \u201d means the specifications, manuals and documentation accompanying Meta Llama 3 distributed by Meta at https:\/\/llama.meta.com\/get-started\/ .",
+    "cot_answer":"Here's the step-by-step reasoning to answer the question:\n\n1. The question asks about the context length supported by Llama 3 models.\n2. In the context, we need to find the relevant information about Llama 3 models and their context length.\n3. The relevant sentence is: \"This results in the most capable Llama model yet, which supports a 8K context length that doubles the capacity of Llama 2.\"\n##begin_quote## This results in the most capable Llama model yet, which supports a 8K context length that doubles the capacity of Llama 2. ##end_quote##\n4. From this sentence, we can see that Llama 3 models support a context length of 8K.\n\n<ANSWER>: 8K",
+    "instruction":"<DOCUMENT> DISTRACT_DOCS 1 <\/DOCUMENT>...<DOCUMENT> DISTRACT_DOCS 4 <\/DOCUMENT>\nWhat is the context length supported by Llama 3 models?"
+}
+```
+As shown in the above example, we have a "question" section for the generated question, a "cot_answer" section for the generated COT answer (where the final answer is added after the "<ANSWER>" token), and an "instruction" section that contains all the documents (each document wrapped in `<DOCUMENT>` and `</DOCUMENT>` tags) with the generated question appended at the end. This "instruction" section will be the input during fine-tuning, and the "cot_answer" will be the output label that the loss is calculated on.
+
+## Creating an Evaluation Set
+To create a reliable evaluation set, it's ideal to use human-annotated question and answer pairs. This ensures that the questions are relevant and the answers are accurate. However, human annotation is time-consuming and costly. For demonstration purposes, we'll use a subset of the validation set, which is never used during fine-tuning. We only need to keep the "question" section and the final answer section, marked by the `<ANSWER>` tag in "cot_answer". We'll manually check each example and select only the good ones. We want to ensure that the questions are general enough to be used for web search engine queries and are related to Llama. We'll also use some QA pairs from our FAQ page, with modifications. This results in 72 question and answer pairs as our evaluation set, saved as `eval_llama.json`.
+
+## Fine-Tuning Steps
+Once the RAFT dataset is ready in JSON format, we can start fine-tuning. Unfortunately, LoRA fine-tuning didn't produce good results, so we'll use full fine-tuning. As an example, we can use the following commands in the llama-recipes main folder:
+
+```bash
+export PATH_TO_ROOT_FOLDER=./raft-8b
+export PATH_TO_RAFT_JSON=recipes/use_cases/end2end-recipes/RAFT-Chatbot/output/raft.jsonl
+torchrun --nnodes 1 --nproc_per_node 4 recipes/quickstart/finetuning/finetuning.py --enable_fsdp --lr 1e-5 --context_length 8192 --num_epochs 1 --batch_size_training 1 --model_name meta-Llama/Meta-Llama-3-8B-Instruct --dist_checkpoint_root_folder $PATH_TO_ROOT_FOLDER --dist_checkpoint_folder fine-tuned --use_fast_kernels --dataset "custom_dataset" --custom_dataset.test_split "test" --custom_dataset.file "recipes/quickstart/finetuning/datasets/raft_dataset.py" --use-wandb --run_validation True --custom_dataset.data_path $PATH_TO_RAFT_JSON
+```
+
+For more details on multi-GPU fine-tuning, please refer to the [multigpu_finetuning.md](../../../quickstart/finetuning/multigpu_finetuning.md) in the finetuning recipe.
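+
+Before launching a full fine-tuning run, it can be useful to sanity-check the custom dataset plumbing by loading a few samples through `get_custom_dataset` and confirming that only the answer tokens carry labels (everything else should be masked to -100). Below is a rough sketch; the tokenizer name, module path, and JSONL path are assumptions based on this tutorial and may need adjusting for your setup.
+
+```python
+# Rough sanity check of the RAFT custom dataset (paths follow this tutorial; adjust as needed).
+import importlib.util
+from types import SimpleNamespace
+from transformers import AutoTokenizer
+
+# Load the custom dataset module added by this recipe.
+spec = importlib.util.spec_from_file_location("raft_dataset", "recipes/quickstart/finetuning/datasets/raft_dataset.py")
+raft_dataset = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(raft_dataset)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-Llama/Meta-Llama-3-8B-Instruct")
+dataset_config = SimpleNamespace(data_path="recipes/use_cases/end2end-recipes/RAFT-Chatbot/output/raft.jsonl")
+train_ds = raft_dataset.get_custom_dataset(dataset_config, tokenizer, split="train")
+
+# Only the assistant (cot_answer) tokens should contribute to the loss.
+sample = train_ds[0]
+num_label_tokens = sum(1 for label in sample["labels"] if label != -100)
+print(f"{len(sample['input_ids'])} input tokens, {num_label_tokens} of them carry labels")
+```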
+
+Next, we need to convert the FSDP checkpoint to a HuggingFace checkpoint using the following command:
+
+```bash
+python src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py --fsdp_checkpoint_path "$PATH_TO_ROOT_FOLDER/fine-tuned-meta-Llama/Meta-Llama-3-8B-Instruct" --consolidated_model_path "$PATH_TO_ROOT_FOLDER"
+```
+
+For more details on FSDP to HuggingFace checkpoint conversion, please refer to the [readme](../../../quickstart/inference/local_inference/README.md) in the inference/local_inference recipe.
+
+## Evaluation Steps
+Once we have the RAFT model, we need to evaluate its performance. In this tutorial, we'll not only use traditional evaluation methods (e.g., calculating the exact match rate or ROUGE score) but also use an LLM as a judge to score model-generated answers.
+
+We'll launch a VLLM server to host our converted model from `PATH_TO_ROOT_FOLDER`. To make things easier, we can rename the model folder to `raft-8b`.
+
+```bash
+CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server --model raft-8b --port 8000 --disable-log-requests
+```
+
+Similarly, if we want to get the 8B Instruct baseline, we can launch an 8B model VLLM server instead:
+
+```bash
+CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server --model meta-Llama/Meta-Llama-3-8B-Instruct --port 8000 --disable-log-requests
+```
+
+In another terminal, we use a Meta Llama 3 70B Instruct model as a judge to compare the answers from the RAFT 8B model with the ground truth and produce a score. To do this, we need to host a separate Meta Llama 3 70B Instruct VLLM server locally with the following command, making sure the port is not in use:
+```bash
+CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server --model meta-Llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 2 --disable-log-requests --port 8001
+```
+
+Then, we can pass the ports to the eval script to evaluate our RAFT model once our `raft-8b` VLLM server is running:
+```bash
+CUDA_VISIBLE_DEVICES=4 python raft_eval.py -m raft-8b -u "http://localhost:8000/v1" -j "http://localhost:8001/v1" -r 5
+```
+
+To evaluate the 8B baseline, we can use the following command once our 8B VLLM server is running:
+```bash
+CUDA_VISIBLE_DEVICES=4 python raft_eval.py -m meta-Llama/Meta-Llama-3-8B-Instruct -u "http://localhost:8000/v1" -j "http://localhost:8001/v1" -r 5
+```
+
+**NOTE**: Please ensure that the `--model` passed when creating the VLLM server matches the `-m` argument of raft_eval.py. Otherwise, VLLM will raise a `model not found` error. By default, the RAFT model is called "raft-8b". Here, `-u` specifies the RAFT model endpoint URL, `-j` specifies the judge model endpoint URL, and `-r` defines how many top-k documents the RAG setup should retrieve.
+
+This [raft_eval.py](./raft_eval.py) script will load questions from the evaluation set, generate answers from the models and the models+RAG setups, and compare the generated answers with the ground truth to compute evaluation metrics, such as the ROUGE score or the LLM-as-judge score. It will then save those metrics and evaluation details to eval logs.
+
+## Experiment Results
+
+**Overview**
+
+During our experiments, we encountered issues when using only the Llama website data, which consisted of 1,980+ RAFT examples generated from 327K characters of text. We believed that this initial data was insufficient, so we created an additional PyTorch RAFT dataset using text from official [Pytorch blogs](https://pytorch.org/blog/) and [Pytorch tutorials](https://pytorch.org/tutorials/).
This new dataset contains 20K+ RAFT examples generated from 4.7 million characters. We combined both datasets to create an `all_data` dataset. We then fine-tuned the 8B model on each dataset separately for 1 epoch with a learning rate of 1e-5, resulting in three RAFT models: `llama_only`, `pytorch_only`, and `all_data`.
+
+**Evaluation on non-RAG baseline**
+
+First, we ran a non-RAG baseline, using just the Meta Llama 3 8B Instruct and Meta Llama 3 70B Instruct models, to see whether the models can already answer some questions without any fine-tuning or external knowledge base. The LLM score, i.e., the percentage of answers marked correct by the LLM-as-judge, is 47.9% for 8B and 59.2% for 70B. Clearly, some of this information was already learned by our Meta Llama 3 models during pretraining.
+
+**Evaluation on RAG baseline**
+
+Then we tested these 3 RAFT models with Langchain RAG, along with the Meta Llama 3 8B Instruct and Meta Llama 3 70B Instruct RAG baselines, using RAG document top-k retrieval values of 3, 5, and 7. We deployed a Meta Llama 3 70B Instruct model as the judge to score our model-generated answers against the ground truth in our evaluation set. The LLM scores are shown below:
+
+
+![RAFT LLM_score comparison](images/LLM_score_comparison.png)
+
+Our results showed that the RAFT models performed similarly to the 8B RAG baseline, but noticeably worse than the 70B RAG baseline, when context documents were limited (top_k <= 5). However, when top_k = 7, the RAFT models' performance increases sharply, with the `all_data` 8B model achieving a score of 76.06%, beating the 70B baseline's 74.65%.
+
+**Refusal Examples**
+
+We also analyzed the number of refusal examples, where the model responded with the refusal answer ("Sorry, I don't know the answer to this question because related documents are not found. Please try again.") instead of attempting an answer. The `all_data` model was more cautious and tended to refuse to answer, whereas the `llama_only` RAFT model did not learn to refuse at all, likely due to the limited dataset size.
+
+![Num of refusal comparison](images/Num_of_refusal_comparison.png)
+
+**Precision Analysis**
+
+We calculated the precision of our model answers, which represents the likelihood of producing a correct answer when the model decides to respond. The formula used was $\frac{LLMScore}{1-\frac{numRefusal}{totalQA}}$.
+
+![Answers Precision](images/Answers_Precision.png)
+
+Note that the 8B and 70B RAG baselines never refused to answer, so their precision was equivalent to their LLM_score. Our `all_data` and `pytorch_only` models tended to refuse to answer when the provided documents were limited (top_k < 5), but when they did generate an answer, the likelihood of it being correct was higher. Specifically, when top_k = 7, the `all_data` RAFT model had an 82.97% likelihood of producing a correct answer when it decided to respond, outperforming the 70B baseline.
+
+**Example Comparisons**
+
+Here are some examples where our `all_data` RAFT model correctly answered questions that the 70B baseline failed to answer:
+
+```
+Comparing interested question: What tokenizer is used as the basis for the special tokens in Meta Llama
+ground_truth: tiktoken
+True all_data_RAG_answers: <ANSWER>: The tokenizer used as the basis for the special tokens in Meta Llama is tiktoken.
+False 70B_RAG_answers: <ANSWER>: The tokenizer used as the basis for the special tokens in Meta Llama is SentencePiece.
+```
+
+```
+Comparing interested question: What is the license under which the Llama Guard model and its weights are released?
+
+ground_truth: The license is the same as Llama 3, which can be found in the LICENSE file and is accompanied by the Acceptable Use Policy.
+True all_data_RAG_answers: <ANSWER>: The license under which the Llama Guard model and its weights are released is the same as Llama 3, and the [LICENSE](../LICENSE) file contains more information about the license.
+False 70B_RAG_answers: <ANSWER>: The Llama Guard model and its weights are licensed under the Llama 2 Community license.
+```
+
+**Key Takeaways**
+
+From our experiments, we learned:
+
+1. A few thousand RAFT examples are insufficient; at least 10K examples are recommended.
+2. The LLM-as-judge is not always reliable; we observed cases where answers were scored incorrectly.
+3. The chunk_size for RAFT documents and RAG documents should be the same.
+4. The RAFT method appears to help the LLM differentiate related documents from distractors rather than simply memorizing the training data, since adding PyTorch data as extra training data still helped our Llama chatbot answer Llama questions. More experiments are needed to understand this better.
+
+## Local Inference Steps
+
+Once we have evaluated and refined our RAFT model, we can deploy it locally and interact with it by asking questions manually. To do this, run the following command:
+
+```bash
+python recipes/quickstart/inference/local_inference/inference.py --model_name raft-8b
+```
+
+For more details, please check the [local_inference recipe](../../../quickstart/inference/local_inference/README.md).
+
+## Acknowledgement
+
+Finally, we would like to extend special thanks to Tianjun Zhang, the first author of the [RAFT paper](https://arxiv.org/pdf/2403.10131), for collaborating with us on this tutorial and providing valuable guidance throughout our experiments. Our code is also partially inspired by the [RAFT section in the Gorilla github](https://github.com/ShishirPatil/gorilla/tree/main/raft).
diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/config.py b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/config.py
new file mode 100644
index 000000000..8b9115f7d
--- /dev/null
+++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/config.py
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+import yaml
+
+def load_config(config_path: str = "./config.yaml"):
+    # Read the YAML configuration file
+    with open(config_path, "r") as file:
+        config = yaml.safe_load(file)
+    return config
diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/eval_llama.json b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/eval_llama.json
new file mode 100644
index 000000000..1fd66af9b
--- /dev/null
+++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/eval_llama.json
@@ -0,0 +1,287 @@
+[
+  {
+    "question":"What is the role of Llama2 70B in generating hard samples?",
+    "answer":" Llama2 70B generates hard samples by producing alternate policy descriptions that flip the label of existing samples."
+  },
+  {
+    "question":"What is the purpose of quantization in machine learning?",
+    "answer":" The purpose of quantization in machine learning is to reduce computational and memory requirements, making models more efficient for deployment."
+  },
+  {
+    "question":"What policy must your use of the Llama Materials adhere to, as specified in this Agreement?",
+    "answer":" The Acceptable Use Policy for the Llama Materials."
+ }, + { + "question":"How is perplexity calculated in the context of fine-tuning a language model?", + "answer":" Perplexity is calculated as an exponentiation of the loss value." + }, + { + "question":"How can the Memory API be used to enhance the conversational capabilities of an LLM?", + "answer":" The Memory API can be used to enhance the conversational capabilities of an LLM by saving conversation history and feeding it along with new questions to the LLM, enabling multi-turn natural conversation chat." + }, + { + "question":"What token is used to signify the end of a message in a turn?", + "answer":" <|eot_id|>" + }, + { + "question":"Where can I find more information about the research behind the Llama-2 model?", + "answer":" https:\/\/ai.meta.com\/research\/publications\/llama-2-open-foundation-and-fine-tuned-chat-models\/" + }, + { + "question":"What tokenizer is used as the basis for the special tokens in Meta Llama ", + "answer":" tiktoken" + }, + { + "question":"What does the model do with the probability of the first token to determine safety?", + "answer":" The model turns the probability of the first token into an \"unsafe\" class probability to determine safety." + }, + { + "question":"Are Meta user data included in the pretraining dataset?", + "answer":" No" + }, + { + "question":"What are the benefits of quantization in neural networks?", + "answer":" The benefits of quantization in neural networks are smaller model sizes, faster fine-tuning, and faster inference." + }, + { + "question":"How does the GPTQ algorithm quantize the weight matrix during post-training?", + "answer":" The GPTQ algorithm quantizes the weight matrix by quantizing each row independently during post-training." + }, + { + "question":"What is the capability of large language models like Meta Llama in terms of following instructions?", + "answer":" They can follow instructions without having previously seen an example of a task." + }, + { + "question":"What trade-off do developers need to consider when deploying LLM systems, according to the Responsible Use Guide?", + "answer":" The trade-off is between model helpfulness and model alignment." + }, + { + "question":"What is the purpose of red-teaming in your organization?", + "answer":" The purpose of red-teaming is to enhance safety and performance." + }, + { + "question":"What is the purpose of the llama-recipes GitHub repo?", + "answer":" The purpose of the llama-recipes GitHub repo is to provide examples, demos, and guidance for using Llama models." + }, + { + "question":"What is the purpose of Meta's Responsible Use Guide for developers using Llama ", + "answer":" The purpose of Meta's Responsible Use Guide is to provide guidance to developers on how to build products powered by LLMs in a responsible manner." + }, + { + "question":"What should be defined to rate the results of the fine-tuned model?", + "answer":" A clear evaluation criteria." + }, + { + "question":"What steps did the developers take to mitigate safety risks in their instruction-tuned Llama model?", + "answer":" The developers took the following steps to mitigate safety risks in their instruction-tuned Llama model: conducting extensive red teaming exercises, performing adversarial evaluations, and implementing safety mitigations techniques." 
+ }, + { + "question":"What behaviors are prohibited in the context of employment and economic benefits?", + "answer":" discrimination, other unlawful conduct, and harmful conduct" + }, + { + "question":"Are there any fees or royalties required to use the Llama Materials under this license?", + "answer":" No, there are no fees or royalties required to use the Llama Materials under this license." + }, + { + "question":"What is the precision in which LLM models can run without performance degradation using AWQ?", + "answer":" 4-bit" + }, + { + "question":"What type of professional practices are not allowed without proper authorization or licensure?", + "answer":" Financial, legal, medical\/health, or related professional practices." + }, + { + "question":"What is the F1 score of Llama Guard 2 when trained on the BeaverTails dataset?", + "answer":" 0.736" + }, + { + "question":"What is the recommended step for developers before deploying applications of Llama ", + "answer":" Perform safety testing and tuning tailored to their specific applications of the model." + }, + { + "question":"What is the license used for the Llama Guard model in the Purple Llama project?", + "answer":" Llama 2 Community License" + }, + { + "question":"What is the first step in developing downstream models responsibly according to the updated guide?", + "answer":" Defining content policies and mitigations." + }, + { + "question":"What data type is used for weights initialized from a normal distribution in 4-bit models?", + "answer":" NF4 (Normal Float 4)" + }, + { + "question":"Where can I find examples of using Llama Guard in recipes?", + "answer":" https:\/\/github.com\/facebookresearch\/llama-recipes" + }, + { + "question":"What is the recommended model-parallel value for the 70B model?", + "answer":" 8" + }, + { + "question":"Where can you find more information about the Meta Llama 70B Model?", + "answer":" The model card," + }, + { + "question":"What percentage of the dataset typically makes up the test and validation sets when using a holdout method?", + "answer":" 10% - 30%," + }, + { + "question":"What are some hosting providers that support running Llama models?", + "answer":" OpenAI, Together AI, Anyscale, Replicate, Groq, etc." + }, + { + "question":"According to the Llama Guard paper, why is it challenging to compare model performance across different models?", + "answer":" Because each model is built on its own policy and performs better on an evaluation dataset with a policy aligned to the model." + }, + { + "question":"What is the advantage of having three partitions of data in the fine-tuning process?", + "answer":" The advantage is to get an unbiased evaluation of the model's performance." + }, + { + "question":"What is included in the Llama 2 model download?", + "answer":" Model code, Model weights, README, Responsible Use Guide, License, Acceptable use policy, Model card, and Technical specifications." + }, + { + "question":"What is the advantage of integrating with custom kernels?", + "answer":" The advantage of integrating with custom kernels is that it allows for support on specific devices." + }, + { + "question":"What is the purpose of the GPTQ algorithm implemented in the AutoGPTQ library?", + "answer":" The purpose of the GPTQ algorithm is post-training quantization." + }, + { + "question":"What advantage does AQLM take of when quantizing multiple weights together?", + "answer":" It takes advantage of interdependencies between the weights." 
+ }, + { + "question":"What is the primary advantage of using lower precision data in resource-constrained environments?", + "answer":" Faster inference and fine-tuning." + }, + { + "question":"How can Meta Llama models be accessed on Microsoft Azure?", + "answer":" Meta Llama models can be accessed on Microsoft Azure through Models as a Service (MaaS) using Azure AI Studio and Model as a Platform (MaaP) using Azure Machine Learning Studio." + }, + { + "question":"What is the purpose of aligning Llama Guard 2 with the Proof of Concept MLCommons taxonomy?", + "answer":" The purpose of aligning Llama Guard 2 with the Proof of Concept MLCommons taxonomy is to drive adoption of industry standards and facilitate collaboration and transparency in the LLM safety and content evaluation space." + }, + { + "question":"What is the name of the repository that provides more examples of Llama recipes?", + "answer":" llama-recipes" + }, + { + "question":"How will I receive the signed URL after my request is approved?", + "answer":" over email" + }, + { + "question":"What is the purpose of the restriction on using Llama Materials?", + "answer":" To prevent the unauthorized use of Llama Materials to enhance competing language models." + }, + { + "question":"What is the format of the prefix-suffix-middle method of infilling?", + "answer":" prefix-suffix-middle" + }, + + { + "question":"What is the license under which the Llama Guard model and its weights are released?", + "answer":" The license is the same as Llama 3, which can be found in the LICENSE file and is accompanied by the Acceptable Use Policy." + }, + { + "question":"How do I download the 4-bit quantized Meta Llama 3 8B chat model using Ollama?", + "answer":" To download the 4-bit quantized Meta Llama 3 8B chat model using Ollama, run the command \"ollama pull llama3\" in your terminal." + }, + { + "question":"How long are the download links for Llama valid for?", + "answer":" 24 hours" + }, + { + "question":"What is the primary purpose of the suite of tools provided?", + "answer":" To support the AI lifecycle, specifically tuning models with enterprise data." + }, + { + "question":"How does Llama Guard 2's classification performance compare to Llama Guard ", + "answer":" Llama Guard 2 has better classification performance than Llama Guard 1." + }, + { + "question":"What data type is used for computations in Quantization Aware Training despite mimicking int8 values?", + "answer":" floating point numbers" + }, + { + "question":"What is the purpose of providing specific examples in a prompt?", + "answer":" The purpose of providing specific examples in a prompt is to help the model better understand what kind of output is expected." + }, + { + "question":"Why is Meta not sharing the training datasets for Llama?", + "answer":"We believe developers will have plenty to work with as we release our model weights and starting code for pre-trained and conversational fine-tuned versions as well as responsible use resources. While data mixes are intentionally withheld for competitive reasons, all models have gone through Meta’s internal Privacy Review process to ensure responsible data usage in building our products. We are dedicated to the responsible and ethical development of our GenAI products, ensuring our policies reflect diverse contexts and meet evolving societal expectations." + }, + { + "question":"Did Meta use human annotators to develop the data for Llama models?", + "answer":"Yes. 
There are more details, for example, about our use of human annotators in the Llama 2 research paper." + }, + { + "question":"Can I use the output of the models to improve the Llama family of models, even though I cannot use them for other LLMs?", + "answer":"It's correct that the license restricts using any part of the Llama models, including the response outputs to train another AI model (LLM or otherwise). However, one can use the outputs to further train the Llama family of models. Techniques such as Quantized Aware Training (QAT) utilize such a technique and hence this is allowed." + }, + { + "question":"What operating systems (OS) are officially supported if I want to use Llama model?", + "answer":"For the core Llama GitHub repos (Llama and Llama3) Linux is the only OS currently supported by this repo. Additional OS support is available through the Llama-Recipes repo." + }, + { + "question":"Do Llama models provide traditional autoregressive text completion?", + "answer":"Llama models are auto-regressive language models, built on the transformer architecture. The core language models function by taking a sequence of words as input and predicting the next word, recursively generating text." + }, + { + "question":"Do Llama models support logit biases as a request parameter to control token probabilities during sampling?", + "answer":"This is implementation dependent (i.e. the code used to run the model)." + }, + { + "question":"Do Llama models support adjusting sampling temperature or top-p threshold via request parameters?", + "answer":"The model itself supports these parameters, but whether they are exposed or not depends on implementation." + }, + { + "question":"What is llama-recipes?", + "answer":"The llama-recipes repository is a companion to the Meta Llama 3 models. The goal of this repository is to provide a scalable library for fine-tuning Meta Llama models, along with some example scripts and notebooks to quickly get started with using the models in a variety of use-cases, including fine-tuning for domain adaptation and building LLM-based applications with Meta Llama and other tools in the LLM ecosystem." + }, + { + "question":"What is the difference on the tokenization techniques that Meta Llama 3 uses compare Llama 2?", + "answer":"Llama 2 uses SentencePiece for tokenization, whereas Llama 3 has transitioned to OpenAI’s Tiktoken." + }, + { + "question":"How many tokens were used in Meta Llama 3 pretrain?", + "answer":"Meta Llama 3 is pretrained on over 15 trillion tokens that were all collected from publicly available sources." + }, + { + "question":"How many tokens were used in Llama 2 pretrain?", + "answer":"Llama 2 was pretrained on 2 trillion tokens of data from publicly available sources." + }, + { + "question":"What is the name of the license agreement that Meta Llama 3 is under?", + "answer":"Meta LLAMA 3 COMMUNITY LICENSE AGREEMENT." + }, + { + "question":"What is the name of the license agreement that Llama 2 is under?", + "answer":"LLAMA 2 COMMUNITY LICENSE AGREEMENT." + }, + { + "question":"What is the context length of Llama 2 models?", + "answer":"Llama 2's context is 4k" + }, + { + "question":"What is the context length of Meta Llama 3 models?", + "answer":"Meta Llama 3's context is 8k" + }, + { + "question":"When is Llama 2 trained?", + "answer":"Llama 2 was trained between January 2023 and July 2023." 
+ }, + { + "question":"What is the name of the Llama 2 model that uses Grouped-Query Attention (GQA) ", + "answer":"Llama 2 70B" + }, + { + "question":"What are the names of the Meta Llama 3 model that use Grouped-Query Attention (GQA) ", + "answer":"Meta Llama 3 8B and Meta Llama 3 70B" + } + ] diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/format.py b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/format.py new file mode 100644 index 000000000..c1bbfb458 --- /dev/null +++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/format.py @@ -0,0 +1,174 @@ +# file copied from https://github.com/ShishirPatil/gorilla/blob/main/raft/format.py +from abc import ABC, abstractmethod +import argparse +from datasets import Dataset, load_dataset +from typing import Dict, Literal, Any, get_args + +""" +This file allows to convert raw HuggingFace Datasets into files suitable to fine tune completion and chat models. +""" + +OutputDatasetType = Literal["parquet", "jsonl"] +outputDatasetTypes = list(get_args(OutputDatasetType)) + +InputDatasetType = Literal["arrow", "jsonl"] +inputDatasetTypes = list(get_args(InputDatasetType)) + +DatasetFormat = Literal["hf", "completion", "chat"] +datasetFormats = list(get_args(DatasetFormat)) + +def get_args() -> argparse.Namespace: + """ + Parses and returns the arguments specified by the user's command + """ + parser = argparse.ArgumentParser() + + parser.add_argument("--input", type=str, required=True, help="Input HuggingFace dataset file") + parser.add_argument("--input-type", type=str, default="arrow", help="Format of the input dataset. Defaults to arrow.", choices=inputDatasetTypes) + parser.add_argument("--output", type=str, required=True, help="Output file") + parser.add_argument("--output-format", type=str, required=True, help="Format to convert the dataset to", choices=datasetFormats) + parser.add_argument("--output-type", type=str, default="jsonl", help="Type to export the dataset to. Defaults to jsonl.", choices=outputDatasetTypes) + parser.add_argument("--output-chat-system-prompt", type=str, help="The system prompt to use when the output format is chat") + + args = parser.parse_args() + return args + +class DatasetFormatter(ABC): + """ + Base class for dataset formatters. Formatters rename columns, remove and add + columns to match the expected target format structure. HF, Chat or Completion models file formats. + https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset + """ + @abstractmethod + def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset: + pass + +class DatasetExporter(ABC): + """ + Base class for dataset exporters. Exporters export dataset to different file types, JSONL, Parquet, ... + """ + @abstractmethod + def export(self, ds: Dataset, output_path: str): + pass + +class DatasetConverter(): + """ + Entry point class. It resolves which DatasetFormatter and which DatasetExporter to use and runs them. 
+ """ + formats: Dict[DatasetFormat, DatasetFormatter] + exporters: Dict[OutputDatasetType, Any] + + def __init__(self) -> None: + self.formats = { + "hf": HuggingFaceDatasetFormatter(), + "completion": OpenAiCompletionDatasetFormatter(), + "chat": OpenAiChatDatasetFormatter() + } + self.exporters = { + "parquet": ParquetDatasetExporter(), + "jsonl": JsonlDatasetExporter() + } + + def convert(self, ds: Dataset, format: DatasetFormat, output_path: str, output_type: OutputDatasetType, params: Dict[str, str]): + if not format in self.formats: + raise Exception(f"Output Format {format} is not supported, pleased select one of {self.formats.keys()}") + + if not output_type in self.exporters: + raise Exception(f"Output Type {output_type} is not supported, pleased select one of {self.exporters.keys()}") + + formatter = self.formats[format] + newds = formatter.format(ds, params) + exporter = self.exporters[output_type] + exporter.export(newds, output_path) + +class HuggingFaceDatasetFormatter(DatasetFormatter): + """ + Returns the HuggingFace Dataset as is + """ + def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset: + return ds + +def _remove_all_columns_but(ds: Dataset, keep_columns) -> Dataset: + """ + HF Dataset doesn't have a way to copy only specific columns of a Dataset so this help + removes all columns but the ones specified. + """ + remove_columns = list(ds.column_names) + for keep in keep_columns: + remove_columns.remove(keep) + ds = ds.remove_columns(remove_columns) + return ds + +class OpenAiCompletionDatasetFormatter(DatasetFormatter): + """ + Returns the Dataset in the OpenAI Completion Fine-tuning file format with two fields "prompt" and "completion". + https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset + """ + def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset: + newds = ds.rename_columns({'question': 'prompt', 'cot_answer': 'completion'}) + return _remove_all_columns_but(newds, ['prompt', 'completion']) + +class OpenAiChatDatasetFormatter(OpenAiCompletionDatasetFormatter): + """ + Returns the Dataset in the OpenAI Chat Fine-tuning file format with one field "messages". + https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset + """ + def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset: + newds = super().format(ds, params) + + def format_messages(row): + messages = [] + if 'system_prompt' in params: + system_prompt = params['system_prompt'] + messages.append({ "role": "system", "content": system_prompt}) + messages.extend([{ "role": "user", "content": row['prompt']}, { "role": "assistant", "content": row['completion']}]) + chat_row = {"messages": messages} + return chat_row + + newds = newds.map(format_messages) + return _remove_all_columns_but(newds, ['messages']) + +def append_extension(path: str, extension: str) -> str: + suffix = "." + extension + if not path.endswith(suffix): + path = path + suffix + return path + + +class JsonlDatasetExporter(DatasetExporter): + """ + Exports the Dataset to a JSONL file + """ + + def export(self, ds: Dataset, output_path: str): + ds.to_json(append_extension(output_path, "jsonl")) + + +class ParquetDatasetExporter(DatasetExporter): + """ + Exports the Dataset to a Parquet file + """ + + def export(self, ds: Dataset, output_path: str): + ds.to_parquet(append_extension(output_path, "parquet")) + + +def main(): + """ + When raft.py is executed from the command line. 
+ """ + args = get_args() + ds = load_dataset(args.input_type, data_files={"train": args.input})['train'] + formatter = DatasetConverter() + + if args.output_chat_system_prompt and args.output_format != "chat": + raise Exception("Parameter --output-chat-system-prompt can only be used with --output-format chat") + + format_params = {} + if args.output_chat_system_prompt: + format_params['system_prompt'] = args.output_chat_system_prompt + + formatter.convert(ds=ds, format=args.output_format, output_path=args.output, output_type=args.output_type, params=format_params) + +if __name__ == "__main__": + main() diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/Answers_Precision.png b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/Answers_Precision.png new file mode 100644 index 000000000..e5d76e526 Binary files /dev/null and b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/Answers_Precision.png differ diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/LLM_score_comparison.png b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/LLM_score_comparison.png new file mode 100644 index 000000000..84027b0da Binary files /dev/null and b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/LLM_score_comparison.png differ diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/Num_of_refusal_comparison.png b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/Num_of_refusal_comparison.png new file mode 100644 index 000000000..a860e5e07 Binary files /dev/null and b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/Num_of_refusal_comparison.png differ diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/RAFT.png b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/RAFT.png new file mode 100644 index 000000000..a2e56b561 Binary files /dev/null and b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/images/RAFT.png differ diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft.py b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft.py new file mode 100644 index 000000000..a216f09eb --- /dev/null +++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft.py @@ -0,0 +1,89 @@ +import logging +import os +import argparse +from raft_utils import generate_questions, add_chunk_to_dataset +from format import DatasetConverter, datasetFormats, outputDatasetTypes +from config import load_config + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +def main(api_config): + ds = None + try: + logging.info("Starting to generate question pair.") + # Generate questions as list for each chunk + chunk_questions_zip = generate_questions(api_config) + if not chunk_questions_zip: + logging.warning("No questions generated from text. Please check the api_config or model configuration.") + return + logging.info(f"Successfully generated {sum([len(q) for c,q in chunk_questions_zip])} question/answer pairs.") + ds = add_chunk_to_dataset(chunk_questions_zip,api_config) + ds.save_to_disk(args.output) + logging.info(f"Data successfully written to {api_config['output']}. 
Process completed.") + formatter = DatasetConverter() + + # Extract format specific params + format_params = {} + formatter.convert(ds=ds, format=args.output_format, output_path=args.output+"raft", output_type=args.output_type, params=format_params) + except Exception as e: + logging.error(f"An unexpected error occurred during the process: {e}",exc_info=True) + +def parse_arguments(): + # Define command line arguments for the script + parser = argparse.ArgumentParser( + description="Generate RAFT question/answer/context pairs from documentation." + ) + parser.add_argument( + "-t", "--questions_per_chunk", + type=int, + default=4, + help="Specify the number of question pairs to generate per chunk." + ) + parser.add_argument( + "-m", "--model", + default="meta-llama/Meta-Llama-3-70B-Instruct", + help="Select the model to use for generation." + ) + parser.add_argument( + "-c", "--config_path", + default="./raft.yaml", + help="Set the configuration file path that has system prompt along with language, dataset path and number of questions." + ) + parser.add_argument( + "-u", "--endpoint_url", + default="http://localhost:8001/v1", + type=str, + help="LLM API url for generating question/answer pairs." + ) + parser.add_argument( + "-k", "--api_key", + default="EMPTY", + type=str, + help="LLM API key for generating question/answer pairs." + ) + parser.add_argument("--chunk_size", type=int, default=1000, help="The size of each chunk in number of tokens") + parser.add_argument("-o","--output", type=str, default="./output/", help="The path at which to save the dataset") + parser.add_argument("--output-format", type=str, default="hf", help="Format to convert the dataset to. Defaults to hf.", choices=datasetFormats) + parser.add_argument("--output-type", type=str, default="jsonl", help="Type to export the dataset to. Defaults to jsonl.", choices=outputDatasetTypes) + return parser.parse_args() + +if __name__ == "__main__": + logging.info("Initializing the process and loading configuration...") + args = parse_arguments() + + api_config = load_config(args.config_path) + api_config["questions_per_chunk"] = args.questions_per_chunk + api_config["model"] = args.model + api_config["chunk_size"] = args.chunk_size + api_config["endpoint_url"] = args.endpoint_url + api_config["output"] = args.output + api_config["api_key"] = args.api_key + # if OPENAI_API_KEY is defined in the system environment, use it as the API key + if os.environ.get('API_KEY') is not None: + api_config["api_key"] = os.environ["API_KEY"] + logging.info(f"Configuration loaded. 
Generating {args.questions_per_chunk} questions per chunk using model '{args.model}'.") + logging.info(f"Chunk size: {args.chunk_size}.") + logging.info(f"num_distract_docs: {api_config['num_distract_docs']}, refusal_probability: {api_config['refusal_probability']}") + logging.info(f"Will use endpoint_url: {args.endpoint_url}.") + logging.info(f"Output will be written to {args.output}.") + main(api_config) diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft.yaml b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft.yaml new file mode 100644 index 000000000..b433a4a0f --- /dev/null +++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft.yaml @@ -0,0 +1,51 @@ +COT_prompt_template: > + <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a helpful chatbot who can provide an answer to every question from the user given a relevant context.<|eot_id|> + <|start_header_id|>user<|end_header_id|> + Question: {question}\nContext: {context}\n + Answer this question using the information given by multiple documents in the context above. Here are the things to pay attention to: + - The context contains many documents, each document starts with <DOCUMENT> and ends </DOCUMENT>. + - First provide step-by-step reasoning on how to answer the question. + - In the reasoning, if you need to copy and paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copied and pasted from the context. + - End your response with the final answer in the form <ANSWER>: $answer. The answer should be less than 60 words. + You MUST begin your final answer with the tag "<ANSWER>:". <|eot_id|><|start_header_id|>assistant<|end_header_id|> + +question_prompt_template: > + <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a synthetic question-answer pair generator. Given a chunk of context about + some topic(s), generate {num_questions} example questions a user could ask and would be answered + using information from the chunk. For example, if the given context was a Wikipedia + paragraph about the United States, an example question could be 'How many states are + in the United States?' + Your questions should be formulated in the same style as questions that users could ask in a search engine. + This means that your questions MUST NOT mention something like "according to the passage" or "context". + The questions should be able to be answered in 60 words or less. Include only the questions in your response.<|eot_id|> + <|start_header_id|>user<|end_header_id|> + Context: {context}\n <|eot_id|><|start_header_id|>assistant<|end_header_id|> + +# question_prompt_template: > +# <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a language model skilled in creating quiz questions. +# You will be provided with a document, +# read it and please generate factoid question and answer pairs that are most likely to be asked by a user of Llama language models +# which includes LLama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1, Meta Llama Guard 2 +# Your factoid questions should be answerable with a specific, concise piece of factual information from the context. +# Your factoid questions should be formulated in the same style as questions users could ask in a search engine. +# This means that your factoid questions MUST NOT mention something like "according to the passage" or "context". +# Please make sure you follow those rules: +# 1.
Generate {num_questions} question answer pairs, you can generate less answer if there is nothing related to +# model, training, fine-tuning and evaluation details of Llama language models, +# 2. The questions can be answered based *solely* on the given passage. +# 3. Avoid asking questions with similar meaning. +# 4. Never use any abbreviation. +# 5. The questions should be able to be answered in 60 words or less. Include only the questions in your response. <|eot_id|> +# <|start_header_id|>user<|end_header_id|> +# Context: {context}\n <|eot_id|><|start_header_id|>assistant<|end_header_id|> +data_dir: "./data" + +xml_path: "" + +chunk_size: 1000 + +questions_per_chunk: 5 + +num_distract_docs: 4 # number of distracting documents to add to each chunk + +refusal_probability: 0.05 # probability of related documents to be added to each chunk diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_eval.py b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_eval.py new file mode 100644 index 000000000..59dd649a6 --- /dev/null +++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_eval.py @@ -0,0 +1,336 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement. +import logging +import evaluate +import argparse +from config import load_config +import json +from langchain_openai import ChatOpenAI +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain_community.vectorstores import FAISS +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.vectorstores.utils import DistanceStrategy +from datetime import datetime +from langchain_community.document_loaders import DirectoryLoader +import re +import string +import pandas as pd + + +def generate_answers_model_only(model_name,question_list,api_url="http://localhost:8000/v1",key="EMPTY"): + # Use langchain to load the documents from data directory + # Load the RAFT model + + llm = ChatOpenAI( + openai_api_key=key, + openai_api_base=api_url, + model_name=model_name, + temperature=0.0, + max_tokens=1000 + ) + + all_tasks = [api_config['eval_prompt_template'].format(question=question) for question in question_list] + generated_answers = llm.batch(all_tasks) + generated_answers = [ item.content for item in generated_answers] + if len(generated_answers) == 0: + logging.error("No model answers generated. 
Please check the input context or model configuration in ",model_name) + return [] + return clean_text_list(generated_answers) +def format_docs_raft(docs): + context = "" + for doc in docs: + context += "\n" + str(doc.page_content) + "\n" + return context +def build_retriever(api_config,embedding_model_name,retrieved_docs_num=5): + # Use langchain to load the documents from data directory + loader = DirectoryLoader(api_config['data_dir']) + docs = loader.load() + # Split the document into chunks with a specified chunk size + text_splitter = RecursiveCharacterTextSplitter(chunk_size=api_config["chunk_size"],chunk_overlap=int(api_config["chunk_size"] / 10),separators= ["----------","\n\n", "\n", " ", ""],strip_whitespace=True) + docs_processed = text_splitter.split_documents(docs) + # Remove duplicates + unique_texts = {} + docs_processed_unique = [] + for doc in docs_processed: + if doc.page_content not in unique_texts: + unique_texts[doc.page_content] = True + docs_processed_unique.append(doc) + logging.info(f"Total number of docs_processed used by vectorstore: {len(docs_processed_unique)}") + # Store the document into a vector store with a specific embedding model + embedding_model = HuggingFaceEmbeddings( + model_name=embedding_model_name, + model_kwargs={"device": "cuda"}, + encode_kwargs={"normalize_embeddings": True}, # Set `True` for cosine similarity + ) + vectorstore = FAISS.from_documents(docs_processed_unique, embedding_model, distance_strategy=DistanceStrategy.COSINE) + retriever = vectorstore.as_retriever( + search_kwargs={"k": retrieved_docs_num}, + ) + return retriever +def generate_answers_with_RAG(model_name, question_list,api_config,retriever,api_url_overwrite=None): + api_url = api_config['model_endpoint_url'] + if api_url_overwrite: + api_url = api_url_overwrite + key = api_config['api_key'] + # Load the RAFT model + llm = ChatOpenAI( + openai_api_key=key, + openai_api_base=api_url, + model_name=model_name, + temperature=0.0, + max_tokens=1000 + ) + all_tasks = [] + for q in question_list: + # retrive the top K documents + retrieved_docs = retriever.invoke(q) + # format the documents into a string + documents = format_docs_raft(retrieved_docs) + # create a prompt + text = api_config["RAG_prompt_template"].format(context=documents,question=q) + all_tasks.append(text) + generated_answers = llm.batch(all_tasks) + generated_answers = [ item.content for item in generated_answers] + if len(generated_answers) == 0: + logging.error("No RAG answers generated. 
Please check the input context or model configuration in " + model_name) + return [] + return clean_text_list(generated_answers) +def compute_rouge_score(generated : list, reference: list): + rouge_score = evaluate.load('rouge') + return rouge_score.compute( + predictions=generated, + references=reference, + use_stemmer=True, + use_aggregator=True + ) +def clean_text_list(text_list): + result = [] + for text in text_list: + # for the raft model, the answer will start with <ANSWER> + index = text.rfind("<ANSWER>") + if index!= -1: + text = text[index:] + text = text.replace("<ANSWER>:","") + text = text.replace("begin_quote","") + text = text.replace("end_quote","") + text = text.replace("##","") + text = text.strip() + result.append(text) + return result + +def normalize_answer(s): + + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) +def exact_match_score(prediction, ground_truth): + """Computes the exact match rate over lists of predictions and ground truth answers.""" + num_match = 0 + assert len(prediction) == len(ground_truth), "Answer length does not match prediction length." + assert(len(ground_truth) > 0) + for idx, (pred,gold) in enumerate(zip(prediction, ground_truth)): + if (normalize_answer(pred) == normalize_answer(gold)): + num_match += 1 + return num_match/len(ground_truth) +def compute_judge_score(questions: list, generated : list, reference: list, api_config,api_url="http://localhost:8001/v1",key="EMPTY"): + correct_num = 0 + model_name = "meta-llama/Meta-Llama-3-70B-Instruct" + llm = ChatOpenAI( + openai_api_key=key, + openai_api_base=api_url, + model_name=model_name, + max_tokens=1000, + temperature=0.0) + all_tasks = [] + for question,prediction,gold in zip(questions, generated,reference): + message = api_config['judge_prompt_template'].format(question=question,prediction=prediction,gold=gold) + all_tasks.append(message) + judge_responses = llm.batch(all_tasks) + judge_responses = ["YES" in item.content for item in judge_responses] + correct_num = sum(judge_responses) + return correct_num/len(questions),judge_responses +def score_single(api_config,generated,reference,questions, run_exact_match=True,run_rouge=True, run_llm_as_judge=True): + # set each metric to the default -1, meaning the metric has not been computed + metric = { + "Rouge_score": -1, + "LLM_judge_score": -1, + "Exact_match": -1 + } + if run_rouge: + rouge_score = compute_rouge_score(generated,reference) + metric["Rouge_score"] = rouge_score + print("Rouge_score:",rouge_score) + if api_config["judge_endpoint_url"] and run_llm_as_judge: + api_url = api_config["judge_endpoint_url"] + LLM_judge_score,judge_responses = compute_judge_score(questions, generated, reference, api_config,api_url=api_url) + metric["LLM_judge_score"] = LLM_judge_score + metric["LLM_judge_responses"] = judge_responses + print(f"LLM_judge_score: {LLM_judge_score}") + if run_exact_match: + exact_match = exact_match_score(generated,reference) + print(f"Exact_match_percentage: {exact_match:.4f}") + metric["Exact_match"] = exact_match + return metric +def main(api_config): + # Since the eval set is small, we can run the eval without async functions + try: + api_url = api_config["model_endpoint_url"] + logging.info("Starting to generate answers for the eval set.") + questions,groud_truth =
[],[] + if api_config["eval_file"].endswith(".parquet"): + eval_file = pd.read_parquet(api_config["eval_file"],filters=[('source', '=', 'pt_discuss_forum')]) + for index, item in eval_file.iterrows(): + questions.append(item["question"]+"\nDetails:\n"+item["context"]) + groud_truth.append(item["answer"]) + else: + with open(api_config["eval_file"]) as fp: + eval_file = json.load(fp) + for index, item in enumerate(eval_file): + questions.append(item["question"]) + groud_truth.append(item["answer"]) + generated_answers = {} + # build retriver + retriever = build_retriever(api_config,"sentence-transformers/multi-qa-mpnet-base-cos-v1",api_config["rag_topk"]) + # Generate answers for 8B models + model_name = api_config["model_name"] + generated_answers[model_name] = generate_answers_model_only(model_name,questions,api_url) + generated_answers[model_name+"_RAG"] = generate_answers_with_RAG(model_name, questions,api_config,retriever) + print("Finished generating answers for ", model_name) + large_model_name = "meta-llama/Meta-Llama-3-70B-Instruct" + large_api_url = api_config["judge_endpoint_url"] + generated_answers["70B_Base"] = generate_answers_model_only(large_model_name,questions,large_api_url) + generated_answers["70B_RAG"] = generate_answers_with_RAG(large_model_name, questions,api_config,retriever,large_api_url) + print("Finished generating answers for ", large_model_name) + logging.info(f"Successfully generated {len(generated_answers[model_name+'_RAG'])} answers for all models.") + # for generate answer from each model, compute the score metric + all_metrics = [] + output_file = api_config["output_log"]+str(datetime.now().strftime("%Y%m%d_%H%M%S")) + + for model_name,model_answer in generated_answers.items(): + if len(model_answer) != len(groud_truth): + print(f"The length of {model_name} answer is not equal to the length of ground truth.") + continue + metric = score_single(api_config,model_answer,groud_truth,questions) + print(f"The eval result for {model_name} is: {metric}") + with open(output_file,"a") as fp: + fp.write(f"Eval_result for {model_name} \n") + fp.write(f"Rouge_score: {metric['Rouge_score']} \n") + fp.write(f"Exact_match_percentage: {metric['Exact_match']} \n") + judge_responses = ["None"] * len(questions) + if api_config["judge_endpoint_url"]: + fp.write(f"LLM_judge_score: {metric['LLM_judge_score']} \n") + judge_responses = metric["LLM_judge_responses"] + all_metrics.append((model_name,metric['LLM_judge_score'],metric["LLM_judge_responses"])) + fp.write(f"QA details: \n") + for item in zip(questions,model_answer,groud_truth,judge_responses): + fp.write(f"question: {item[0]} \n") + fp.write(f"generated_answers: {item[1]} \n") + fp.write(f"groud_truth: {item[2]} \n") + fp.write(f"LLM_judge_response: {item[3]} \n") + fp.write("\n") + fp.write("\n------------------------------------\n") + # Now we want to take a closer look at the questions that are not answered the same by all the models. 
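+ # all_metrics holds (model_name, LLM_judge_score, per-question judge responses) for each model;
+ # zipping the response lists below groups the judge verdicts question by question, e.g. with two
+ # models and three questions judge_zip looks like [(True, True), (False, True), (True, False)],
+ # so we can single out the questions on which the evaluated models disagree.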
+ judge_zip = list(zip(*[item[-1] for item in all_metrics])) + model_names = [item[0] for item in all_metrics] + with open(output_file,"a") as fp: + for item in all_metrics: + fp.write(f"Model_Name: {item[0]}, LLM_SCORE: {item[1]} \n") + for idx,item in enumerate(judge_zip): + # if all the responses are "YES", then we skip this question + if sum(item) == len(item): + continue + else: + fp.write(f"Comparing interested question: {questions[idx]} \n") + fp.write(f"groud_truth: {groud_truth[idx]} \n") + for i in range(len(model_names)): + fp.write(f"{item[i]} {model_names[i]}_answers: {generated_answers[model_names[i]][idx]} \n") + fp.write("------------------------\n") + fp.write(json.dumps(all_metrics)) + print("Finished evaluating the model.") + + + logging.info(f"Eval successfully, the eval result is saved to {api_config['output_log']}.") + # Saving the eval result to a log file + except Exception as e: + logging.error(f"An unexpected error occurred during the process: {e}",exc_info=True) + +def parse_arguments(): + # Define command line arguments for the script + parser = argparse.ArgumentParser( + description="Generate question/answer pairs from documentation." + ) + parser.add_argument( + "-m", "--model_name", + default=None, + help="Provide the model_name to use for evaluation. If not specified, the model_path in eval_config.yaml will be used." + ) + parser.add_argument( + "-c", "--config_path", + default="raft_eval_config.yaml", + help="Set the configuration file path that has system prompt along with language, evalset path." + ) + parser.add_argument( + "-d", "--data_dir", + default=None, + help="Provide the data folder path to build RAG for evaluation. If not specified, the data_dir in eval_config.yaml will be used." + ) + parser.add_argument( + "-u", "--model_endpoint_url", + default="http://localhost:8000/v1", + type=str, + help="The raft model endpoint url for eval." + ) + parser.add_argument( + "-j", "--judge_endpoint_url", + default=None, + type=str, + help="The large model endpoint url for judge as LLM." + ) + parser.add_argument( + "-o", "--output_log", + default="./eval_result", + help="save the eval result to a log file. Default is eval_result[timestamp].log" + ) + parser.add_argument( + "-k", "--api_key", + default="EMPTY", + type=str, + help="LLM API key for generating question/answer pairs." + ) + parser.add_argument( + "-r", "--rag_topk", + default=5, + type=int, + help="set the number of top k documents the RAG needs to retrive." 
+ ) + parser.add_argument("--chunk_size", type=int, default=1000, help="The character size of each chunk used in RAG") + return parser.parse_args() + +if __name__ == "__main__": + logging.info("Initializing the process and loading configuration...") + args = parse_arguments() + api_config = load_config(args.config_path) + api_config["model_endpoint_url"] = args.model_endpoint_url + if args.data_dir: + api_config["data_dir"] = args.data_dir + if args.model_name: + api_config["model_name"] = args.model_name + api_config["judge_endpoint_url"] = args.judge_endpoint_url + api_config["output_log"] = args.output_log + api_config["api_key"] = args.api_key + api_config["chunk_size"] = args.chunk_size + api_config["rag_topk"] = args.rag_topk + if api_config["judge_endpoint_url"]: + logging.info(f"The judge model url is: '{args.judge_endpoint_url}'.") + main(api_config) diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_eval_config.yaml b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_eval_config.yaml new file mode 100644 index 000000000..9cd5baa76 --- /dev/null +++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_eval_config.yaml @@ -0,0 +1,37 @@ +eval_prompt_template: > + <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are an AI assistant skilled in answering questions related to Llama language models, + which includes LLama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1, Meta Llama Guard 2. + Below is a question from a Llama user, please answer it to the best of your knowledge. + The returned answer should be no more than 60 words. Please return the answers in text directly without any special tokens.<|eot_id|> + <|start_header_id|>user<|end_header_id|> + Question:{question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|> +judge_prompt_template: > + <|begin_of_text|><|start_header_id|>system<|end_header_id|>You have been provided with a question, a teacher's answer and a student's answer below. + Given that question, you need to score how good the student's answer is compared to + the teacher's answer. If the student's answer is correct based on the teacher's answer, then return YES, else return NO. + Here are the grading criteria to follow: + 1. Review it carefully to make sure that the keywords and numerical values are exactly the same. + 2. Ensure that the student answer does not contain any conflicting statements. + 3. It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the ground truth answer. + YES means that the student's answer meets all of the criteria. + NO means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give. + Only respond with "YES" or "NO", do not respond with anything else.<|eot_id|> + <|start_header_id|>user<|end_header_id|> + Question: {question} \n Teacher's Answer: {gold} \n Student's Answer: {prediction} <|eot_id|><|start_header_id|>assistant<|end_header_id|> +RAG_prompt_template: > + <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a helpful chatbot who can provide an answer to every question from the user given a relevant context.<|eot_id|> + <|start_header_id|>user<|end_header_id|> + Question: {question}\nContext: {context}\n + Answer this question using the information given by multiple documents in the context above. Here are the things to pay attention to: + - The context contains many documents, each document starts with <DOCUMENT> and ends </DOCUMENT>.
+ - First provide step-by-step reasoning on how to answer the question. + - In the reasoning, if you need to copy and paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copied and pasted from the context. + - End your response with the final answer in the form <ANSWER>: $answer. The answer should be less than 60 words. + You MUST begin your final answer with the tag "<ANSWER>:". <|eot_id|><|start_header_id|>assistant<|end_header_id|> +eval_file: "./eval_llama.json" + +model_name: "raft-8b" + +data_dir: "./data" + +rag_topk: 5 diff --git a/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_utils.py b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_utils.py new file mode 100644 index 000000000..73ae187b7 --- /dev/null +++ b/recipes/use_cases/end2end-recipes/RAFT-Chatbot/raft_utils.py @@ -0,0 +1,245 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. + +import os +import logging +from langchain.text_splitter import RecursiveCharacterTextSplitter +from datasets import Dataset +import random +from langchain_community.document_loaders import SitemapLoader,DirectoryLoader +from bs4 import BeautifulSoup +from langchain_openai import ChatOpenAI +import copy + + +# Initialize logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +def strip_str(s: str) -> str: + """ + Helper function for formatting strings returned by the LLM. + """ + l, r = 0, len(s)-1 + beg_found = False + for i in range(len(s)): + if s[i].isalpha(): + if not beg_found: + l = i + beg_found = True + else: + r = i + r += 2 + return s[l:min(r, len(s))] +def clean_documents(raw_text): + all_lines = [] + for line in raw_text.split("\n"): + line = line.strip() + if len(line.split()) == 0: + continue + else: + all_lines.append(line) + result = " ".join(all_lines) + return result +def clean_text(content: BeautifulSoup) -> str: + # Find all 'nav' and 'header' elements in the BeautifulSoup object + nav_elements = content.find_all("nav") + header_elements = content.find_all("header") + mydivs = content.find_all("div", {"role": "list"}) + # Remove each 'nav' and 'header' element from the BeautifulSoup object + for element in nav_elements + header_elements+mydivs: + element.decompose() + raw_text = content.get_text("\n") + return clean_documents(raw_text) +# Read the documents either from webpage links listed in a sitemap xml file or from a local data folder +def read_file_content(xml_path: str, data_folder: str) -> list: + if xml_path and data_folder: + logging.info(f"Error: both xml_path and data_folder are provided, will only read from xml for now") + if not xml_path and not data_folder: + logging.info(f"Error: neither xml_path nor data_folder is provided") + return "" + if xml_path: + if not os.path.exists(xml_path): + logging.info(f"Error: {xml_path} does not exist") + return "" + # Use langchain to load the documents from webpage links in the xml file + sitemap_loader = SitemapLoader(web_path=xml_path,is_local=True,parsing_function=clean_text) + sitemap_loader.requests_kwargs = {"verify": False} + docs = sitemap_loader.load() + return docs + elif len(data_folder) != 0: + if not os.path.exists(data_folder): + logging.info(f"Error: {data_folder} does not exist") + return "" + # Use langchain to load the documents from data folder + loader = DirectoryLoader(data_folder) + docs = loader.load() + return docs + + + +def get_chunks( + docs: list, + chunk_size: int = 1000, +
api_config: dict = None, +) -> list[str]: + """ + Takes in a list of documents, breaks them down into chunks of size + `chunk_size`, and returns the chunks. + """ + chunks = [] + if len(docs) == 0: + raise TypeError("Can not get chunks from empty text") + else: + text_splitter = RecursiveCharacterTextSplitter(chunk_size=api_config["chunk_size"],chunk_overlap=int(api_config["chunk_size"] / 10),separators= ["----------","\n\n", "\n", " "],strip_whitespace=True) + docs_processed = text_splitter.split_documents(docs) + logging.info(f"Total number of docs_processed: {len(docs_processed)}") + # Remove duplicates + unique_texts = {} + docs_processed_unique = [] + for doc in docs_processed: + if doc.page_content not in unique_texts and len(doc.page_content) > 100 : + unique_texts[doc.page_content] = True + docs_processed_unique.append(doc) + chunks = [chunk.page_content for chunk in docs_processed_unique] + logging.info(f"Total number of docs_processed_unique: {len(docs_processed_unique)}") + return chunks +# read all the files in the data folder, then split them into chunks +# generate questions for each chunk and return zip of chunk and related questions list +def generate_questions(api_config): + # get documents from the data folder or xml file + api_url = api_config["endpoint_url"] + key = api_config["api_key"] + documents = read_file_content(api_config["xml_path"],api_config["data_dir"]) + if len(documents) == 0: + logging.info(f"Error reading files, document_text is {len(documents)}") + document_batches = get_chunks(documents,api_config["chunk_size"],api_config) + # use OpenAI API protocol to hanlde the chat request, including local VLLM openai compatible server + llm = ChatOpenAI( + openai_api_key=key, + openai_api_base=api_url, + model_name=api_config["model"], + temperature=0.0, + max_tokens=500 + ) + all_tasks = [api_config['question_prompt_template'].format(num_questions=str(api_config['questions_per_chunk']),context=document) for document in document_batches] + generated_answers = llm.batch(all_tasks) + generated_answers = [ item.content for item in generated_answers] + if len(generated_answers) == 0: + logging.error("No model answers generated. 
Please check the input context or model configuration in ",api_config["model"]) + return [] + final_result = [] + for result in generated_answers: + queries = result.split('\n') + queries = [strip_str(q) for q in queries] + queries = [q for q in queries if any(c.isalpha() for c in q)] + if len(queries) > int(api_config['questions_per_chunk']): + # As the model may have unrelated question at the begining of the result + # if queries is more than questions_per_chunk, then we need to truncate it and only keep last questions_per_chunk lines + queries = queries[-int(api_config['questions_per_chunk']):] + final_result.append(queries) + return list(zip(document_batches,final_result)) + +# Generate COT answer for each question given the chunk context +def generate_COT(chunk_questions_zip,api_config) -> dict: + all_tasks = [] + chunk_questions = [] + question_asked = set() + for document_content,questions in chunk_questions_zip: + for question in questions: + question = question.strip() + # avoid asking the same question twice + if question not in question_asked: + question_asked.add(question) + prompt = api_config['COT_prompt_template'].format(question=question,context=str(document_content)) + all_tasks.append(prompt) + chunk_questions.append((document_content,question)) + # use OpenAI API protocol to hanlde the chat request, including local VLLM openai compatible server + llm = ChatOpenAI( + openai_api_key=api_config["api_key"], + openai_api_base=api_config["endpoint_url"], + model_name=api_config["model"], + temperature=0.0, + max_tokens=500 + ) + generated_answers = llm.batch(all_tasks) + generated_answers = [ item.content for item in generated_answers] + COT_results = [] + # return a list of (chunk, question, generated_answer) + for (chunk, question),generated_answer in zip(chunk_questions,generated_answers): + COT_results.append((chunk,question,generated_answer)) + return COT_results + +def add_chunk_to_dataset( + chunk_questions_zip: list, + api_config: dict, +) -> None: + """ + Given a chunk and related questions lists, create {Q, A, D} triplets and add them to the dataset. 
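+ For every question, the oracle chunk is combined with `num_distract_docs` randomly sampled distractor chunks,
+ and with probability `refusal_probability` an additional refusal example is created whose context omits the
+ oracle chunk and whose label is a fixed refusal message. The assembled examples are returned as a Hugging Face Dataset.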
+ """ + num_distract = api_config["num_distract_docs"] + p = api_config["refusal_probability"] + chunks = [chunk for chunk, _ in chunk_questions_zip] + COT_results = generate_COT(chunk_questions_zip,api_config) + logging.info(f"COT generation completed, total num of COT results: {len(COT_results)}") + completed,refusal= 0,0 + data_list = [] + for chunk, q , cot in COT_results: + # The COT answer will be used as the label in the fine-tuning stage + + datapt = { + "id": None, + "type": "general", + "question": q, + "context": None, + "oracle_context": None, + "cot_answer": cot + } + i = chunks.index(chunk) + datapt["id"] = f"seed_task_{len(data_list)}" + # add num_distract distractor docs + docs = [chunk] + indices = list(range(0, len(chunks))) + indices.remove(i) + for j in random.sample(indices, num_distract): + docs.append(chunks[j]) + doc_copy = docs.copy() + random.shuffle(docs) + d = { + "title": [], + "sentences": [] + } + + d["title"].append(["placeholder_title"]*(num_distract+1)) + d["sentences"].append(docs) + datapt["context"] = d + datapt["oracle_context"] = chunk + + # construct model instruction + context = "" + for doc in docs: + context += "<DOCUMENT>" + str(doc) + "</DOCUMENT>\n" + context += q + # This instruction will be used in the fine-tuning stage + datapt["instruction"] = context + datapt_copy = copy.deepcopy(datapt) + # add to dataset + data_list.append(datapt) + # decide whether to add a refusal example where the related documents are not provided + add_refusal = random.uniform(0, 1) <= p + if add_refusal: + # swap the oracle document for a random distractor so the context no longer contains the answer + doc_copy[0] = chunks[random.sample(indices, 1)[0]] + random.shuffle(doc_copy) + refusal_context = "" + for doc in doc_copy: + refusal_context += "<DOCUMENT>" + str(doc) + "</DOCUMENT>\n" + refusal_context += q + # This instruction will be used in the fine-tuning stage + datapt_copy["id"] = f"refusal_task_{len(data_list)}" + datapt_copy["instruction"] = refusal_context + datapt_copy["cot_answer"] = "Sorry, I don't know the answer to this question because related documents are not found. Please try again."
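+ # the refusal example is appended next and the `refusal` counter incremented, so the progress log below can report how many refusal examples were added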
+ data_list.append(datapt_copy) + refusal += 1 + completed += 1 + if completed % 100 == 0: + logging.info(f"refusal example added: {refusal}, total examples added: {completed}, total examples to be added: {len(COT_results)- completed}") + ds = Dataset.from_list(data_list) + return ds diff --git a/requirements.txt b/requirements.txt index f3cac001e..b27263c14 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,13 @@ chardet openai typing-extensions==4.8.0 tabulate -codeshield +evaluate +rouge_score +pyyaml==6.0.1 +faiss-gpu +unstructured[pdf] +langchain_openai +langchain +langchain_community +sentence_transformers +codeshield \ No newline at end of file diff --git a/src/llama_recipes/configs/datasets.py b/src/llama_recipes/configs/datasets.py index 05377bbed..139de223a 100644 --- a/src/llama_recipes/configs/datasets.py +++ b/src/llama_recipes/configs/datasets.py @@ -3,28 +3,27 @@ from dataclasses import dataclass - + @dataclass class samsum_dataset: dataset: str = "samsum_dataset" train_split: str = "train" test_split: str = "validation" - - + + @dataclass class grammar_dataset: dataset: str = "grammar_dataset" - train_split: str = "src/llama_recipes/datasets/grammar_dataset/gtrain_10k.csv" + train_split: str = "src/llama_recipes/datasets/grammar_dataset/gtrain_10k.csv" test_split: str = "src/llama_recipes/datasets/grammar_dataset/grammar_validation.csv" - + @dataclass class alpaca_dataset: dataset: str = "alpaca_dataset" train_split: str = "train" test_split: str = "val" data_path: str = "src/llama_recipes/datasets/alpaca_data.json" - @dataclass class custom_dataset: @@ -32,6 +31,7 @@ class custom_dataset: file: str = "recipes/quickstart/finetuning/datasets/custom_dataset.py" train_split: str = "train" test_split: str = "validation" + data_path: str = "" @dataclass class llamaguard_toxicchat_dataset: