add discord reader (run-llama#144)

Co-authored-by: Jerry Liu <[email protected]>
MaxJPRey · Dec 27, 2022 · 8b1dba9 · 8b1dba9
1 parent 5c96583
commit 8b1dba9
Show file tree

Hide file tree

Showing 7 changed files with 279 additions and 10 deletions.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -24,6 +24,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements.txt
+          pip install -r data_requirements.txt
       - name: Run linter
         run: make lint
 
diff --git a/data_requirements.txt b/data_requirements.txt
@@ -3,6 +3,7 @@
 wikipedia
 pymongo
 slack_sdk
+discord.py
 
 # google
 google-api-python-client

diff --git a/docs/how_to/data_connectors.md b/docs/how_to/data_connectors.md
@@ -7,6 +7,9 @@ The API reference documentation can be found [here](/reference/readers.rst).
 - [Notion](https://developers.notion.com/) (`NotionPageReader`)
 - [Google Docs](https://developers.google.com/docs/api) (`GoogleDocsReader`)
 - [Slack](https://api.slack.com/) (`SlackReader`)
+- [Discord](https://discord.com/developers/docs/intro) (`DiscordReader`)
+    - Note: We use the [discord.py](https://github.com/Rapptz/discord.py) API wrapper for Discord. This is meant to be used
+    in an async setting; however, we adapt it to synchronous Document loading.
 - Wikipedia (`WikipediaReader`)
 
 #### Databases

diff --git a/examples/data_connectors/DiscordDemo.ipynb b/examples/data_connectors/DiscordDemo.ipynb
@@ -0,0 +1,103 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "effeb5a7-8544-4ee4-8c11-bad0d8165394",
+   "metadata": {},
+   "source": [
+    "# Discord Demo\n",
+    "Demonstrates our Discord data connector"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "04edcd4a-5633-47ee-8a92-ff2f6abc2ec7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# DiscordReader internally calls `asyncio.get_event_loop().run_until_complete`.\n",
+    "# Since the Jupyter kernel itself already runs on an event loop,\n",
+    "# we need nest_asyncio to allow the nested call.\n",
+    "!pip install nest_asyncio\n",
+    "import nest_asyncio\n",
+    "nest_asyncio.apply()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ea1f66d-10ed-4417-bdcb-f8a894836ea5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from gpt_index import GPTListIndex, DiscordReader\n",
+    "from IPython.display import Markdown, display\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "da90589a-fb44-4ec6-9706-753dba4fa968",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "discord_token = os.getenv(\"DISCORD_TOKEN\")\n",
+    "channel_ids = [1057178784895348746]  # Replace with your channel_id\n",
+    "documents = DiscordReader(discord_token=discord_token).load_data(channel_ids=channel_ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "341295df-2029-4728-ab3d-2ee178a7e6f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index = GPTListIndex(documents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "01c26b9d-49ec-4a6e-9c61-5c06bb86bbb2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = index.query(\"<query_text>\", verbose=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f160c678-2fb5-4d6d-b2bc-87abb61cfdec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "display(Markdown(f\"<b>{response}</b>\"))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "gpt_retrieve_venv",
+   "language": "python",
+   "name": "gpt_retrieve_venv"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/gpt_index/__init__.py b/gpt_index/__init__.py
@@ -41,16 +41,19 @@
 )
 
 # readers
-from gpt_index.readers.file import SimpleDirectoryReader
-from gpt_index.readers.google.gdocs import GoogleDocsReader
-from gpt_index.readers.mongo import SimpleMongoReader
-from gpt_index.readers.notion import NotionPageReader
-
-# allow importing Document at the top-level
-from gpt_index.readers.schema.base import Document
-from gpt_index.readers.slack import SlackReader
-from gpt_index.readers.weaviate.reader import WeaviateReader
-from gpt_index.readers.wikipedia import WikipediaReader
+from gpt_index.readers import (
+    DiscordReader,
+    Document,
+    FaissReader,
+    GoogleDocsReader,
+    NotionPageReader,
+    PineconeReader,
+    SimpleDirectoryReader,
+    SimpleMongoReader,
+    SlackReader,
+    WeaviateReader,
+    WikipediaReader,
+)
 
 # token predictor
 from gpt_index.token_predictor.mock_chain_wrapper import MockLLMPredictor
@@ -83,6 +86,9 @@
     "GoogleDocsReader",
     "SlackReader",
     "WeaviateReader",
+    "FaissReader",
+    "PineconeReader",
+    "DiscordReader",
     "LLMPredictor",
     "MockLLMPredictor",
 ]
diff --git a/gpt_index/readers/__init__.py b/gpt_index/readers/__init__.py
@@ -10,6 +10,7 @@
 
 """
 
+from gpt_index.readers.discord_reader import DiscordReader
 from gpt_index.readers.faiss import FaissReader
 
 # readers
@@ -29,6 +30,7 @@
     "SimpleMongoReader",
     "NotionPageReader",
     "GoogleDocsReader",
+    "DiscordReader",
     "SlackReader",
     "WeaviateReader",
     "PineconeReader",

diff --git a/gpt_index/readers/discord_reader.py b/gpt_index/readers/discord_reader.py
@@ -0,0 +1,153 @@
+"""Discord reader.
+
+Note: this file is named discord_reader.py to avoid conflicts with the
+discord.py module.
+
+"""
+
+import asyncio
+import logging
+import os
+from typing import Any, List, Optional
+
+from gpt_index.readers.base import BaseReader
+from gpt_index.readers.schema.base import Document
+
+logger = logging.getLogger(__name__)
+
+
+async def read_channel(
+    discord_token: str, channel_id: int, limit: Optional[int], oldest_first: bool
+) -> str:
+    """Async read channel.
+
+    Note: This is our hack to create a synchronous interface to the
+    async discord.py API. We use the `asyncio` module to run
+    this function with `asyncio.get_event_loop().run_until_complete`.
+
+    """
+    import discord  # noqa: F401
+
+    messages: List[discord.Message] = []
+
+    class CustomClient(discord.Client):
+        async def on_ready(self) -> None:
+            try:
+                print(f"{self.user} has connected to Discord!")
+                channel = client.get_channel(channel_id)
+                # only work for text channels for now
+                if not isinstance(channel, discord.TextChannel):
+                    raise ValueError(
+                        f"Channel {channel_id} is not a text channel. "
+                        "Only text channels are supported for now."
+                    )
+                # thread_dict maps thread_id to thread
+                thread_dict = {}
+                for thread in channel.threads:
+                    thread_dict[thread.id] = thread
+
+                async for msg in channel.history(
+                    limit=limit, oldest_first=oldest_first
+                ):
+                    messages.append(msg)
+                    if msg.id in thread_dict:
+                        thread = thread_dict[msg.id]
+                        async for thread_msg in thread.history(
+                            limit=limit, oldest_first=oldest_first
+                        ):
+                            messages.append(thread_msg)
+            except Exception as e:
+                print("Encountered error: " + str(e))
+            finally:
+                await self.close()
+
+    intents = discord.Intents.default()
+    intents.message_content = True
+    client = CustomClient(intents=intents)
+    await client.start(discord_token)
+
+    msg_txt_list = [m.content for m in messages]
+
+    return "\n\n".join(msg_txt_list)
+
+
+class DiscordReader(BaseReader):
+    """Discord reader.
+
+    Reads conversations from channels.
+
+    Args:
+        discord_token (Optional[str]): Discord token. If not provided, we
+            assume the environment variable `DISCORD_TOKEN` is set.
+
+    """
+
+    def __init__(self, discord_token: Optional[str] = None) -> None:
+        """Initialize with parameters."""
+        try:
+            import discord  # noqa: F401
+        except ImportError:
+            raise ValueError(
+                "`discord.py` package not found, please run `pip install discord.py`"
+            )
+        if discord_token is None:
+            discord_token = os.environ["DISCORD_TOKEN"]
+            if discord_token is None:
+                raise ValueError(
+                    "Must specify `discord_token` or set environment "
+                    "variable `DISCORD_TOKEN`."
+                )
+
+        self.discord_token = discord_token
+
+    def _read_channel(
+        self, channel_id: int, limit: Optional[int] = None, oldest_first: bool = True
+    ) -> str:
+        """Read channel."""
+        result = asyncio.get_event_loop().run_until_complete(
+            read_channel(
+                self.discord_token, channel_id, limit=limit, oldest_first=oldest_first
+            )
+        )
+        return result
+
+    def load_data(self, **load_kwargs: Any) -> List[Document]:
+        """Load data from the input directory.
+
+        Args:
+            channel_ids (List[str]): List of channel ids to read.
+            limit (Optional[int]): Maximum number of messages to read.
+            oldest_first (bool): Whether to read oldest messages first.
+                Defaults to `True`.
+
+        Returns:
+            List[Document]: List of documents.
+
+        """
+        channel_ids = load_kwargs.pop("channel_ids", None)
+        if channel_ids is None:
+            raise ValueError('Must specify a "channel_id" in `load_kwargs`.')
+        limit = load_kwargs.pop("limit", None)
+        oldest_first = load_kwargs.pop("oldest_first", True)
+
+        results = []
+        for channel_id in channel_ids:
+            if not isinstance(channel_id, int):
+                raise ValueError(
+                    f"Channel id {channel_id} must be an integer, "
+                    f"not {type(channel_id)}."
+                )
+            channel_content = self._read_channel(
+                channel_id, limit=limit, oldest_first=oldest_first
+            )
+            results.append(
+                Document(channel_content, extra_info={"channel": channel_id})
+            )
+        return results
+
+
+if __name__ == "__main__":
+    reader = DiscordReader()
+    print("initialized reader")
+    output = reader.load_data(channel_ids=[1057178784895348746], limit=10)
+    print(output)