Skip to content

Commit

Permalink
Refactor 'crosspost_cmp()' to use better message content comparison f…
Browse files Browse the repository at this point in the history
…unction, split 'message_length_threshold' into same and cross-channel cases
  • Loading branch information
Mega-JC committed Jul 28, 2024
1 parent 748bdcf commit 2744f14
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy-to-vps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
SSH_PRIVATE_KEY: ${{ secrets.VPS_SSH_PRIVATE_KEY }}
REMOTE_HOST: ${{ secrets.VPS_HOST }}
REMOTE_USER: ${{ secrets.VPS_USER }}
TARGET: ${{ secrets.VPS_TARGET }}
TARGET: ${{ secrets.VPS_TARGET }}/
SCRIPT_AFTER: |
cp ~/config.py config.py
cp ~/.env .env
Expand Down
69 changes: 57 additions & 12 deletions pcbot/exts/anti_crosspost.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,22 @@
fetched_attachments: dict[int, bytes] = {}


def hamming_distance_padded(str1, str2):
    """Count the positions at which two strings differ.

    The shorter string is right-padded with spaces so that both strings
    have the same length before they are compared position by position.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The number of positions whose characters differ after padding.
    """
    # Both operands are padded to the longer length; ljust() leaves a
    # string that is already long enough untouched.
    width = max(len(str1), len(str2))
    padded_pairs = zip(str1.ljust(width), str2.ljust(width))

    mismatches = 0
    for left, right in padded_pairs:
        if left != right:
            mismatches += 1
    return mismatches


def hamming_similarity_score(str1: str, str2: str) -> float:
    """Return the fraction of positions at which two strings agree.

    The mismatch count comes from ``hamming_distance_padded`` and is
    normalised by the length of the longer string, so the result is 1.0
    when no position differs and 0.0 when every position differs.

    Args:
        str1 (str): First string to compare.
        str2 (str): Second string to compare.

    Returns:
        float: Similarity ratio between 0.0 and 1.0. Two empty strings
        yield 1.0.
    """
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        # Two empty strings have no differing positions; report them as
        # identical instead of dividing by zero below.
        return 1.0

    distance = hamming_distance_padded(str1, str2)
    return (max_len - distance) / max_len


async def fetch_attachment(attachment: discord.Attachment, cache: bool = True) -> bytes:
if cache and attachment.id in fetched_attachments:
logger.debug(f"Fetched attachment from cache: {attachment.id}")
Expand Down Expand Up @@ -55,10 +71,7 @@ async def crosspost_cmp(message: discord.Message, other: discord.Message) -> boo
)

if have_content:
hamming_score = sum(
x != y for x, y in zip(message.content, other.content)
) / max(len(message.content), len(other.content))
similarity_score = min(max(0, 1 - hamming_score), 1)
similarity_score = hamming_similarity_score(message.content, other.content)
logger.debug(f"Computed similarity score for content: {similarity_score}")
else:
similarity_score = 0
Expand Down Expand Up @@ -115,7 +128,8 @@ def __init__(
bot: BotT,
channel_ids: Collection[int],
crosspost_timedelta_threshold: int,
message_length_threshold: int,
same_channel_message_length_threshold: int,
cross_channel_message_length_threshold: int,
max_tracked_users: int,
max_tracked_message_groups_per_user: int,
theme_color: int | discord.Color = 0,
Expand All @@ -127,7 +141,10 @@ def __init__(
bot (BotT): The bot instance.
channel_ids (Collection[int]): Collection of channel IDs to monitor.
crosspost_timedelta_threshold (int): Minimum time difference between messages to not be considered crossposts.
message_length_threshold (int): Minimum length of a text-only message to be considered.
same_channel_message_length_threshold (int): Minimum length of a text-only message to be considered
if the messages are in the same channel.
cross_channel_message_length_threshold (int): Minimum length of a text-only message to be considered
if the messages are in different channels.
max_tracked_users (int): Maximum number of users to track.
max_tracked_message_groups_per_user (int): Maximum number of message
groups to track per user.
Expand All @@ -140,7 +157,12 @@ def __init__(
self.crosspost_timedelta_threshold = crosspost_timedelta_threshold
self.max_tracked_users = max_tracked_users
self.max_tracked_message_groups_per_user = max_tracked_message_groups_per_user
self.message_length_threshold = message_length_threshold
self.same_channel_message_length_threshold = (
same_channel_message_length_threshold
)
self.cross_channel_message_length_threshold = (
cross_channel_message_length_threshold
)

@commands.Cog.listener()
async def on_message(self, message: discord.Message):
Expand All @@ -157,12 +179,18 @@ async def on_message(self, message: discord.Message):
or (
message.content
and not message.attachments
and len(message.content) < self.message_length_threshold
and (
len(message.content)
< min(
self.same_channel_message_length_threshold,
self.cross_channel_message_length_threshold,
)
)
)
):
return

logger.debug(f"Received message from {message.author.name}: {message.jump_url}")
logger.debug(f"Received noteworthy message from {message.author.name}: {message.jump_url}")

# Attempt to enforce the cache size limit
for user_id in list(self.crossposting_cache.keys()):
Expand All @@ -189,6 +217,18 @@ async def on_message(self, message: discord.Message):
for messages in user_cache["message_groups"]:
for existing_message in messages:
if (
message.channel.id == existing_message.channel.id
and len(message.content)
< self.same_channel_message_length_threshold
) or (
message.channel.id != existing_message.channel.id
and len(message.content)
< self.cross_channel_message_length_threshold
):
# enforce same-channel and cross-channel message length thresholds in order for them to be considered crossposts
continue

elif (
await crosspost_cmp(message, existing_message)
and message.created_at.timestamp()
- existing_message.created_at.timestamp()
Expand Down Expand Up @@ -320,7 +360,8 @@ async def setup(
max_tracked_users: int = 10,
max_tracked_message_groups_per_user: int = 10,
crosspost_timedelta_threshold: int = 86400,
message_length_threshold: int = 64,
same_channel_message_length_threshold: int = 64,
cross_channel_message_length_threshold: int = 16,
theme_color: int | discord.Color = 0,
):
"""
Expand All @@ -332,15 +373,19 @@ async def setup(
max_tracked_users (int): Maximum number of users to track.
max_tracked_message_groups_per_user (int): Maximum number of message groups to track per user.
crosspost_timedelta_threshold (int): Minimum time difference between messages to not be considered crossposts.
message_length_threshold (int): Minimum length of a text-only message to be considered.
same_channel_message_length_threshold (int): Minimum length of a text-only message to be considered
if the messages are in the same channel.
cross_channel_message_length_threshold (int): Minimum length of a text-only message to be considered
if the messages are in different channels.
theme_color (int | discord.Color): Theme color for the bot's responses.
"""
await bot.add_cog(
AntiCrosspostCog(
bot,
channel_ids,
crosspost_timedelta_threshold,
message_length_threshold,
same_channel_message_length_threshold,
cross_channel_message_length_threshold,
max_tracked_users,
max_tracked_message_groups_per_user,
theme_color,
Expand Down

0 comments on commit 2744f14

Please sign in to comment.