diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..d376238 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "cSpell.words": [ + "Rutkowski" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..b899cae --- /dev/null +++ b/README.md @@ -0,0 +1,66 @@ + +# Duplicate Guard + +**Duplicate Guard** is a lightweight GitHub Action designed to prevent duplicate files from being added or modified in a repository. This helps reduce repository bloat, minimize downloadable app sizes, and optimize asset management. Duplicate files can significantly increase the size of compressed artifacts (such as ZIP files), because each file is compressed independently and every identical copy is stored in full. This action ensures your repository remains clean and efficient by detecting and blocking redundant files. + +--- + +## 🚀 Features +- Detects and blocks unintentionally duplicated files in pull requests. +- Helps reduce downloadable app sizes by eliminating redundant assets. +- Supports .gitignore-like syntax to exclude specific files or directories. + +--- + +## 🛠️ Usage + +### 1. **Create an Ignore File** +Add a `duplicate_guard.ignore` file to the root of your repository to define patterns for files or directories to exclude from duplicate checks. The syntax is `.gitignore`-like: one shell-style wildcard pattern per line, matched against each file's path relative to the repository root; blank lines and lines starting with `#` are skipped. + +**Example `duplicate_guard.ignore`:** +```gitignore +test/* +logs/* +*.log +``` + +--- + +### 2. 
**Add the GitHub Action** +Create a GitHub Actions workflow in `.github/workflows/duplicate_guard.yml`: + +```yaml +name: Duplicate Guard +on: + pull_request: + branches: + - master + workflow_dispatch: + +jobs: + duplicate_guard: + runs-on: ubuntu-latest + steps: + - name: Duplicate Guard + uses: chris-rutkowski/duplicate-guard@v1.0.0 +``` + +--- + +## ⚙️ Configuration + +### **Specify a Custom Ignore File Path** +If your `duplicate_guard.ignore` file is not in the root directory, specify its location using the `ignore_file` input: + +```yaml +steps: + - name: Duplicate Guard + uses: chris-rutkowski/duplicate-guard@v1.0.0 + with: + ignore_file: ./my/path/my_duplicate_guard.ignore +``` + +--- + +## 📄 License +This project is licensed under the [MIT License](LICENSE). diff --git a/action.yml b/action.yml new file mode 100644 index 0000000..f231766 --- /dev/null +++ b/action.yml @@ -0,0 +1,26 @@ +name: "Duplicate Guard" +description: "Blocks pull requests with unintentionally duplicated files" +author: "Chris Rutkowski" +inputs: + ignore_file: + description: "Path to the ignore file" + required: true + default: "./duplicate_guard.ignore" + +runs: + using: "composite" + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v45 + with: + separator: "," + + - name: Run Duplicate Guard + run: | + files="${{ steps.changed-files.outputs.added_files }},${{ steps.changed-files.outputs.modified_files }}" + python3 ${GITHUB_ACTION_PATH}/duplicate_guard.py ${{ inputs.ignore_file }} "$files" + shell: bash diff --git a/duplicate_guard.py b/duplicate_guard.py new file mode 100644 index 0000000..68d0ce9 --- /dev/null +++ b/duplicate_guard.py @@ -0,0 +1,66 @@ +import fnmatch +import hashlib +import os +import sys + +def load_ignore_patterns(ignore_file): + with open(ignore_file, "r") as f: + return [line.strip() for line in f if line.strip() and not line.startswith("#")] + +def 
def should_ignore(file, patterns):
    """Return True when *file* matches at least one ignore pattern.

    Matching is done with ``fnmatch.fnmatch`` against the whole path, so the
    path separator is not special: ``*`` also matches across directories.
    """
    return any(fnmatch.fnmatch(file, pattern) for pattern in patterns)


def calculate_checksum(file_path):
    """Return the hexadecimal SHA-256 digest of the file at *file_path*.

    The file is read in 4096-byte blocks so large files are never loaded
    into memory at once.
    """
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()


def get_all_repository_files(ignore_patterns):
    """Return the relative paths of all non-ignored files below the CWD.

    Directories named ``.git`` are skipped entirely: their contents are
    version-control metadata, not repository files.
    """
    repo_files = []
    for root, dirs, files in os.walk("."):
        # Prune in place so os.walk does not descend into git metadata.
        dirs[:] = [name for name in dirs if name != ".git"]
        for file in files:
            relative_path = os.path.relpath(os.path.join(root, file), ".")
            if not should_ignore(relative_path, ignore_patterns):
                repo_files.append(relative_path)
    return repo_files


def build_checksum_index(ignore_patterns):
    """Map each SHA-256 digest to the list of repository paths having it.

    Every path is kept (not just the last one seen), so the result does not
    depend on the order in which ``os.walk`` yields files.
    """
    index = {}
    for path in get_all_repository_files(ignore_patterns):
        # Skip entries that cannot be read as regular files (e.g. dangling
        # symlinks) instead of aborting the whole run.
        if not os.path.isfile(path):
            continue
        index.setdefault(calculate_checksum(path), []).append(path)
    return index


def find_duplicates(files, ignore_patterns, checksums=None):
    """Check the changed *files* against the repository and each other.

    Args:
        files: Paths of added/modified files; empty or missing entries are
            skipped.
        ignore_patterns: Patterns passed to ``should_ignore``.
        checksums: Optional digest -> list-of-paths index as produced by
            ``build_checksum_index``; built on demand when omitted. Files
            not yet present in it are added.

    Returns:
        A list of ``(file, other_path)`` tuples, one per detected duplicate
        pair. An ``Error:`` line is printed for each of them.
    """
    if checksums is None:
        checksums = build_checksum_index(ignore_patterns)

    duplicates = []
    for file in files:
        if not file or not os.path.isfile(file):
            continue

        # Normalise so "./a.txt" compares equal to the walked path "a.txt".
        normalized = os.path.normpath(file)

        if should_ignore(normalized, ignore_patterns):
            print(f"Ignoring: '{file}'")
            continue

        print(f"Processing: '{file}'")

        known_paths = checksums.setdefault(calculate_checksum(file), [])
        # The file itself is normally part of the index (it exists on disk),
        # so only *other* paths with the same digest count as duplicates.
        others = [path for path in known_paths if path != normalized]
        if normalized not in known_paths:
            known_paths.append(normalized)

        for other in others:
            print(f"Error: '{file}' is a duplicate of '{other}'")
            duplicates.append((file, other))

    return duplicates


def main(argv=None):
    """Run the duplicate check and return the process exit code.

    Args:
        argv: ``[ignore_file, comma_separated_files]``; defaults to
            ``sys.argv[1:]``.

    Returns:
        0 when no duplicate was found, 1 when at least one was found, and
        2 when the required arguments are missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        print("Usage: duplicate_guard.py <ignore_file> <comma_separated_files>")
        return 2

    ignore_file, changed_files = args[0], args[1]
    ignore_patterns = load_ignore_patterns(ignore_file)

    # Step 1: Build a checksum index for all existing repository files
    print("Calculating checksums for all repository files...")
    checksums = build_checksum_index(ignore_patterns)
    print(f"Done, {len(checksums)} checksums")

    # Step 2: Check new/modified files against the repository and themselves
    duplicates = find_duplicates(
        changed_files.split(","), ignore_patterns, checksums
    )
    return 1 if duplicates else 0


if __name__ == "__main__":
    sys.exit(main())