Skip to content

Commit

Permalink
add discord reader (run-llama#144)
Browse files Browse the repository at this point in the history

Co-authored-by: Jerry Liu <[email protected]>
  • Loading branch information
jerryjliu and Jerry Liu authored Dec 27, 2022
1 parent 5c96583 commit 8b1dba9
Show file tree
Hide file tree
Showing 7 changed files with 279 additions and 10 deletions.
1 change: 1 addition & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r data_requirements.txt
- name: Run linter
run: make lint

1 change: 1 addition & 0 deletions data_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
wikipedia
pymongo
slack_sdk
discord.py

# google
google-api-python-client
Expand Down
3 changes: 3 additions & 0 deletions docs/how_to/data_connectors.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ The API reference documentation can be found [here](/reference/readers.rst).
- [Notion](https://developers.notion.com/) (`NotionPageReader`)
- [Google Docs](https://developers.google.com/docs/api) (`GoogleDocsReader`)
- [Slack](https://api.slack.com/) (`SlackReader`)
- [Discord](https://discord.com/developers/docs/intro) (`DiscordReader`)
- Note: We use the [discord.py](https://github.com/Rapptz/discord.py) API wrapper for Discord. This is meant to be used
in an async setting; however, we adapt it to synchronous Document loading.
- Wikipedia (`WikipediaReader`)

#### Databases
Expand Down
103 changes: 103 additions & 0 deletions examples/data_connectors/DiscordDemo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "effeb5a7-8544-4ee4-8c11-bad0d8165394",
"metadata": {},
"source": [
"# Discord Demo\n",
"Demonstrates our Discord data connector"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "04edcd4a-5633-47ee-8a92-ff2f6abc2ec7",
"metadata": {},
"outputs": [],
"source": [
"# nest_asyncio is needed because DiscordReader drives the async discord.py\n",
"# client with `loop.run_until_complete`. Since the Jupyter kernel itself\n",
"# already runs on an event loop, we need to allow nested event loops.\n",
"!pip install nest_asyncio\n",
"import nest_asyncio\n",
"nest_asyncio.apply()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ea1f66d-10ed-4417-bdcb-f8a894836ea5",
"metadata": {},
"outputs": [],
"source": [
"from gpt_index import GPTListIndex, DiscordReader\n",
"from IPython.display import Markdown, display\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "da90589a-fb44-4ec6-9706-753dba4fa968",
"metadata": {},
"outputs": [],
"source": [
"discord_token = os.getenv(\"DISCORD_TOKEN\")\n",
"channel_ids = [1057178784895348746] # Replace with your channel_id\n",
"documents = DiscordReader(discord_token=discord_token).load_data(channel_ids=channel_ids)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "341295df-2029-4728-ab3d-2ee178a7e6f1",
"metadata": {},
"outputs": [],
"source": [
"index = GPTListIndex(documents)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "01c26b9d-49ec-4a6e-9c61-5c06bb86bbb2",
"metadata": {},
"outputs": [],
"source": [
"response = index.query(\"<query_text>\", verbose=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f160c678-2fb5-4d6d-b2bc-87abb61cfdec",
"metadata": {},
"outputs": [],
"source": [
"display(Markdown(f\"<b>{response}</b>\"))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "gpt_retrieve_venv",
"language": "python",
"name": "gpt_retrieve_venv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
26 changes: 16 additions & 10 deletions gpt_index/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,19 @@
)

# readers
from gpt_index.readers.file import SimpleDirectoryReader
from gpt_index.readers.google.gdocs import GoogleDocsReader
from gpt_index.readers.mongo import SimpleMongoReader
from gpt_index.readers.notion import NotionPageReader

# allow importing Document at the top-level
from gpt_index.readers.schema.base import Document
from gpt_index.readers.slack import SlackReader
from gpt_index.readers.weaviate.reader import WeaviateReader
from gpt_index.readers.wikipedia import WikipediaReader
from gpt_index.readers import (
DiscordReader,
Document,
FaissReader,
GoogleDocsReader,
NotionPageReader,
PineconeReader,
SimpleDirectoryReader,
SimpleMongoReader,
SlackReader,
WeaviateReader,
WikipediaReader,
)

# token predictor
from gpt_index.token_predictor.mock_chain_wrapper import MockLLMPredictor
Expand Down Expand Up @@ -83,6 +86,9 @@
"GoogleDocsReader",
"SlackReader",
"WeaviateReader",
"FaissReader",
"PineconeReader",
"DiscordReader",
"LLMPredictor",
"MockLLMPredictor",
]
2 changes: 2 additions & 0 deletions gpt_index/readers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"""

from gpt_index.readers.discord_reader import DiscordReader
from gpt_index.readers.faiss import FaissReader

# readers
Expand All @@ -29,6 +30,7 @@
"SimpleMongoReader",
"NotionPageReader",
"GoogleDocsReader",
"DiscordReader",
"SlackReader",
"WeaviateReader",
"PineconeReader",
Expand Down
153 changes: 153 additions & 0 deletions gpt_index/readers/discord_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
"""Discord reader.
Note: this file is named discord_reader.py to avoid conflicts with the
discord.py module.
"""

import asyncio
import logging
import os
from typing import Any, List, Optional

from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document

logger = logging.getLogger(__name__)


async def read_channel(
    discord_token: str, channel_id: int, limit: Optional[int], oldest_first: bool
) -> str:
    """Read one Discord text channel and return its messages as text.

    Note: This is our hack to create a synchronous interface to the
    async discord.py API. Callers run this coroutine with
    `asyncio.get_event_loop().run_until_complete`.

    A throwaway client logs in, collects the channel history inside its
    `on_ready` handler and then closes itself, which lets `client.start`
    return. For every collected message whose id is also the id of one of
    the channel's threads, that thread's history is appended right after
    the message.

    Args:
        discord_token (str): Token used to log in to Discord.
        channel_id (int): Id of the channel to read.
        limit (Optional[int]): Forwarded to `history()` for the channel and,
            separately, for each thread.
        oldest_first (bool): Forwarded to `history()`.

    Returns:
        str: Message contents joined by blank lines. Errors raised while
            reading are logged, not propagated; whatever was collected
            before the error (possibly nothing) is still returned.

    """
    # Imported lazily so the package stays importable without discord.py.
    import discord

    messages: List[discord.Message] = []

    class CustomClient(discord.Client):
        async def on_ready(self) -> None:
            try:
                logger.info("%s has connected to Discord!", self.user)
                channel = self.get_channel(channel_id)
                # only work for text channels for now
                if not isinstance(channel, discord.TextChannel):
                    raise ValueError(
                        f"Channel {channel_id} is not a text channel. "
                        "Only text channels are supported for now."
                    )
                # thread_dict maps thread_id to thread
                thread_dict = {thread.id: thread for thread in channel.threads}

                async for msg in channel.history(
                    limit=limit, oldest_first=oldest_first
                ):
                    messages.append(msg)
                    if msg.id in thread_dict:
                        thread = thread_dict[msg.id]
                        async for thread_msg in thread.history(
                            limit=limit, oldest_first=oldest_first
                        ):
                            messages.append(thread_msg)
            except Exception as e:
                # Best effort: keep what was read so far, but leave a
                # traceback in the logs instead of a bare printed line.
                logger.exception("Encountered error: %s", e)
            finally:
                # Closing the client is what makes `client.start` return.
                await self.close()

    intents = discord.Intents.default()
    # Enabled so that `Message.content` is populated for the read messages.
    intents.message_content = True
    client = CustomClient(intents=intents)
    await client.start(discord_token)

    return "\n\n".join(m.content for m in messages)


class DiscordReader(BaseReader):
    """Discord reader.

    Reads conversations from channels.

    Args:
        discord_token (Optional[str]): Discord token. If not provided, we
            assume the environment variable `DISCORD_TOKEN` is set.

    """

    def __init__(self, discord_token: Optional[str] = None) -> None:
        """Initialize with parameters.

        Raises:
            ValueError: If `discord.py` is not installed, or if no token is
                passed and `DISCORD_TOKEN` is not set in the environment.

        """
        try:
            import discord  # noqa: F401
        except ImportError as err:
            raise ValueError(
                "`discord.py` package not found, please run `pip install discord.py`"
            ) from err
        if discord_token is None:
            # `.get` instead of indexing: a missing variable must reach the
            # ValueError below rather than escape as a bare KeyError.
            discord_token = os.environ.get("DISCORD_TOKEN")
        if discord_token is None:
            raise ValueError(
                "Must specify `discord_token` or set environment "
                "variable `DISCORD_TOKEN`."
            )

        self.discord_token = discord_token

    def _read_channel(
        self, channel_id: int, limit: Optional[int] = None, oldest_first: bool = True
    ) -> str:
        """Read one channel synchronously and return its text.

        Runs the async `read_channel` coroutine to completion on the
        current event loop.
        """
        # NOTE(review): newer Python versions deprecate `get_event_loop()`
        # when no loop is set; kept as-is because the notebook workflow
        # relies on `nest_asyncio` patching the current loop - confirm
        # before switching to `asyncio.run`.
        result = asyncio.get_event_loop().run_until_complete(
            read_channel(
                self.discord_token, channel_id, limit=limit, oldest_first=oldest_first
            )
        )
        return result

    def load_data(self, **load_kwargs: Any) -> List[Document]:
        """Load data from the given Discord channels.

        Args:
            channel_ids (List[int]): List of channel ids to read.
            limit (Optional[int]): Maximum number of messages to read.
            oldest_first (bool): Whether to read oldest messages first.
                Defaults to `True`.

        Returns:
            List[Document]: One document per channel, with the channel id
                stored in `extra_info["channel"]`.

        Raises:
            ValueError: If `channel_ids` is missing or contains a
                non-integer id.

        """
        channel_ids = load_kwargs.pop("channel_ids", None)
        if channel_ids is None:
            raise ValueError('Must specify "channel_ids" in `load_kwargs`.')
        limit = load_kwargs.pop("limit", None)
        oldest_first = load_kwargs.pop("oldest_first", True)

        # Validate every id before touching the network, so a bad id late in
        # the list does not waste the reads of the channels before it.
        channel_ids = list(channel_ids)
        for channel_id in channel_ids:
            if not isinstance(channel_id, int):
                raise ValueError(
                    f"Channel id {channel_id} must be an integer, "
                    f"not {type(channel_id)}."
                )

        results = []
        for channel_id in channel_ids:
            channel_content = self._read_channel(
                channel_id, limit=limit, oldest_first=oldest_first
            )
            results.append(
                Document(channel_content, extra_info={"channel": channel_id})
            )
        return results


if __name__ == "__main__":
    # Manual smoke test: needs DISCORD_TOKEN in the environment and access
    # to the hard-coded channel below.
    discord_reader = DiscordReader()
    print("initialized reader")
    demo_channel_ids = [1057178784895348746]
    documents = discord_reader.load_data(channel_ids=demo_channel_ids, limit=10)
    print(documents)

0 comments on commit 8b1dba9

Please sign in to comment.