Commit 42a66d7: initial commit
chris-rutkowski committed Dec 24, 2024 (1 parent: 45dd257)

Showing 4 changed files with 163 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,5 @@
{
  "cSpell.words": [
    "Rutkowski"
  ]
}
66 changes: 66 additions & 0 deletions README.md
@@ -0,0 +1,66 @@

# Duplicate Guard

**Duplicate Guard** is a lightweight GitHub Action that prevents duplicate files from being added or modified in a repository. This helps reduce repository bloat, minimize downloadable app sizes, and optimize asset management. Duplicate files can significantly increase the size of compressed artifacts (such as ZIP files) because each file is compressed independently, so identical copies are not deduplicated. This action keeps your repository clean and efficient by detecting and blocking redundant files.
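
A quick sketch of why this matters for ZIP artifacts (file names and sizes below are illustrative): because each entry is deflated on its own, a duplicated asset is stored again in full.

```python
import os
import zipfile

payload = os.urandom(512 * 1024)  # incompressible data, ~512 KiB

def zip_size(file_count: int) -> int:
    # Each entry is compressed independently, so duplicates are not deduplicated.
    with zipfile.ZipFile("demo.zip", "w", zipfile.ZIP_DEFLATED) as zf:
        for i in range(file_count):
            zf.writestr(f"asset_{i}.bin", payload)
    return os.path.getsize("demo.zip")

print(zip_size(1))  # ~512 KiB
print(zip_size(2))  # ~1 MiB: the duplicate roughly doubles the archive
```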

---

## 🚀 Features
- Detects and blocks unintentionally duplicated files in pull requests.
- Helps reduce downloadable app sizes by eliminating redundant assets.
- Supports .gitignore-like syntax to exclude specific files or directories.

---

## 🛠️ Usage

### 1. **Create an Ignore File**
Add a `duplicate_guard.ignore` file to the root of your repository to define patterns for files or directories to exclude from duplicate checks. The syntax follows `.gitignore` conventions.

**Example `duplicate_guard.ignore`:**
```gitignore
test/*
logs/*
*.log
```
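
Under the hood, each changed file's repository-relative path is matched against these patterns with Python's `fnmatch` (see `duplicate_guard.py` below), so a quick way to check whether a path would be skipped is (paths below are illustrative):

```python
import fnmatch

patterns = ["test/*", "logs/*", "*.log"]

# Same check as should_ignore() in duplicate_guard.py
def is_ignored(path: str) -> bool:
    return any(fnmatch.fnmatch(path, pattern) for pattern in patterns)

print(is_ignored("logs/app.log"))         # True  (matches logs/* and *.log)
print(is_ignored("test/fixtures/a.png"))  # True  (matches test/*)
print(is_ignored("src/main.py"))          # False (checked for duplicates)
```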

---

### 2. **Add the GitHub Action**
Create a GitHub Actions workflow in `.github/workflows/duplicate_guard.yml`:

```yaml
name: Duplicate Guard
on:
  pull_request:
    branches:
      - master
  workflow_dispatch:

jobs:
  duplicate_guard:
    runs-on: ubuntu-latest
    steps:
      - name: Duplicate Guard
        uses: chris-rutkowski/[email protected]
```

---

## ⚙️ Configuration

### **Specify a Custom Ignore File Path**
If your `duplicate_guard.ignore` file is not in the root directory, specify its location using the `ignore_file` input:

```yaml
steps:
  - name: Duplicate Guard
    uses: chris-rutkowski/[email protected]
    with:
      ignore_file: ./my/path/my_duplicate_guard.ignore
```

---

## 📄 License
This project is licensed under the [MIT License](LICENSE).
26 changes: 26 additions & 0 deletions action.yml
@@ -0,0 +1,26 @@
name: "Duplicate Guard"
description: "Blocks pull requests with unintentionally duplicated files"
author: "Chris Rutkowski"
inputs:
ignore_file:
description: "Path to the ignore file"
required: true
default: "./duplicate_guard.ignore"

runs:
using: "composite"
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Get changed files
id: changed-files
uses: tj-actions/changed-files@v45
with:
separator: ","

- name: Run Duplicate Guard
run: |
files="${{ steps.changed-files.outputs.added_files }},${{ steps.changed-files.outputs.modified_files }}"
python3 ${GITHUB_ACTION_PATH}/duplicate_guard.py ${{ inputs.ignore_file }} "$files"
shell: bash
66 changes: 66 additions & 0 deletions duplicate_guard.py
@@ -0,0 +1,66 @@
import fnmatch
import hashlib
import os
import sys

def load_ignore_patterns(ignore_file):
    # Read non-empty, non-comment lines from the ignore file.
    with open(ignore_file, "r") as f:
        return [line.strip() for line in f if line.strip() and not line.startswith("#")]

def should_ignore(file, patterns):
    return any(fnmatch.fnmatch(file, pattern) for pattern in patterns)

def calculate_checksum(file_path):
    # SHA-256 of the file contents, read in 4 KiB blocks.
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()

def get_all_repository_files(ignore_patterns):
    # Walk the working tree and collect relative paths that are not ignored.
    repo_files = []
    for root, _, files in os.walk("."):
        for file in files:
            file_path = os.path.join(root, file)
            relative_path = os.path.relpath(file_path, ".")
            if not should_ignore(relative_path, ignore_patterns):
                repo_files.append(relative_path)
    return repo_files

ignore_file = sys.argv[1]
files = sys.argv[2].split(",")
ignore_patterns = load_ignore_patterns(ignore_file)

# Step 1: Build a checksum map for all existing repository files
print("Calculating checksums for all repository files...")
checksums = {}
for file in get_all_repository_files(ignore_patterns):
    checksum = calculate_checksum(file)
    checksums[checksum] = file
print(f"Done, {len(checksums)} checksums")

# Step 2: Check new/modified files against the repository and themselves
exit_code = 0

for file in files:
    if not file or not os.path.isfile(file):
        continue

    if should_ignore(file, ignore_patterns):
        print(f"Ignoring: '{file}'")
        continue

    print(f"Processing: '{file}'")

    checksum = calculate_checksum(file)

    if checksum in checksums:
        # The matching entry from step 1 may be this very file; that is not a duplicate.
        if checksums[checksum] == file:
            continue

        print(f"Error: '{file}' is a duplicate of '{checksums[checksum]}'")
        exit_code = 1
    else:
        checksums[checksum] = file

sys.exit(exit_code)
