Merge pull request #30 from generic-github-user/python

Maintenance merge of python/ap branch into mainline
generic-github-user · Aug 31, 2022 · 94f741b · 94f741b
2 parents c53c961 + e4519ed
commit 94f741b
Show file tree

Hide file tree

Showing 6 changed files with 608 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,8 +1,20 @@
 *.temp
 ao_misc
+aoarchive
+
 todo/todo_backups/*
 todo/complete.*
+snapshot.todo
+
 ao_config.sh
+aorecur.sh
+
 ao_db.*
 ao_result.json
 aolog
+ao_db_backups
+db_block_*.json
+imgindex.json
+
+*.pickle
+config.py
diff --git a/ap/README.md b/ap/README.md
@@ -0,0 +1,79 @@
+# ap
+
+*ao-python*
+
+An alternative to the shell-based version of the ao codebase, which is
+beginning to show issues with maintainability and scalability. Rust, Lisp, and
+zx were also considered (among other languages), but Python's interoperability
+(particularly, the massive ecosystem of useful packages and libraries) and
+convenience made it the best choice for this next iteration. Another advantage
+is how seamlessly pickling/serialization integrate with Python's object model,
+permitting fluid development of customized classes that are easy for humans to
+interact with.
+
+The core philosophy is the same: track everything, always; and make things
+easier for the user (myself in particular), even at the expense of speed or
+mathematical elegance. This tool implements many functionalities (file
+tracking, bookmark management) that I had been planning to design standalone
+tools for; I still might in the future, but being tied to an established API
+inhibits rapid prototyping, and generalization makes it more time-consuming to
+tailor functionality to my needs (though I still try to follow practices like
+DRY for the sake of maintainability and consistency as the codebase grows).
+
+At this point, the Python branch is my preferred tool and I'm unsure to what
+extent (if any) I will continue developing and/or maintaining the shell
+version. It should go without saying that this toolkit is tailored to my needs
+and some of the design choices may seem unconventional; regardless, I think the
+code is reasonably clean and comprehensible so you shouldn't run into very many
+issues modifying or extending it.
+
+## Features
+
+Essentially the same as ao/master; they generally fall into one of these
+categories:
+
+- File tracking and automated backups (and searching, archiving, alerts, etc.)
+- Ergonomic text-based todo lists
+- Bookmark and history management, web crawling/archival, information management, etc.
+
+One key difference is that this version is designed to allow productive
+interaction with the internal data via a REPL, a feature which is unwieldy
+in most other languages (or even in Python with more rigid data structures;
+graphs are great for generality and symbolic inference, but often very irksome
+to access by hand).
+
+## Installation
+
+Clone this repository, then execute the main script using Python (version 3.10
+recommended). The `-i` flag can be used to open a REPL for interactive use. If
+you plan on using the script frequently (via the shell), I'd recommend adding
+something like the following to your `.bashrc`/`.zshrc`/etc (the path will of
+course need to be adjusted).
+
+```
+alias ap='python ~/Desktop/ao/ap/ap.py'
+alias api='python -i ~/Desktop/ao/ap/ap.py'
+```
+
+## Usage
+
+TODO
+
+## Branches
+
+- `ap-docs`: documentation specific to ap
+- `ap-restructuring`: reorganizing the codebase for the Python version into distinct modules
+- `ap-todo`: updated todo list handling with fewer edge cases
+- `command-line`: CLI for `python`
+- `comments`: more thoroughly documenting the `python` branch and its associated branches (see also: `ap-docs`)
+- `docs`: documentation, comments, etc.
+- `file-processing`: alternate version of `files` corresponding to `python`; parent to `images` and other related branches
+- `files`: file tracking code (obsolete-ish)
+- `images`: sub-branch of `python`; extracting data from images using OCR and other methods
+- `master`: original (shell-based) version of ao
+- `notes`: note-taking and search functionality
+- `plotting`: helpful functions for visualizations, graphs, etc.
+- `python`: the Python-based version of ao (i.e., ap); also used as a parent branch for more specialized features
+- `system-monitoring`: recording historical information about processor usage, core temperature, available memory, etc.
+- `todo-handling`: scripts that manage todo lists
+- `web`: web crawling/indexing tools, bookmark management, etc.
diff --git a/ap/ap.py b/ap/ap.py
@@ -0,0 +1,178 @@
+# OS/system utilities and file interaction
+import os
+import sys
+import shutil
+from pathlib import Path
+import argparse
+
+import graphviz
+
+# Serialization and other miscellaneous tools
+import pickle
+import copy
+import time
+import itertools
+
+import re
+import string
+
+# Logging
+import warnings
+import textwrap
+
+# Monitoring
+import psutil
+import hashlib
+from PIL import Image
+from PIL import UnidentifiedImageError
+import pytesseract
+
+# Utilities and internal modules
+from utils import getsize, hashfile
+from files import filenode, snapshot, catalog
+
+# Configuration data
+from config import *
+
+# Command-line interface: one sub-parser per subcommand, each of which
+# registers its handler through set_defaults(func=...)
+parser = argparse.ArgumentParser()
+#parser.add_argument('subcommand', type=str)
+subparsers = parser.add_subparsers()
+
+# `imf <text>`: open every file node whose `text` attribute contains <text>
+# (case-insensitive); nodes without a `text` attribute are skipped. `data`
+# and `openfiles` are looked up when the handler runs, not when it is defined
+imf_parser = subparsers.add_parser('imf')
+imf_parser.add_argument('text', type=str)
+imf_parser.set_defaults(func=lambda argvals: openfiles(list(filter(lambda x: hasattr(x, 'text') and argvals.text.lower() in x.text.lower(), data['files']))))
+
+# Disabled loading logic (falls back to an empty database on any error)
+#try:
+#    with open(dbpath, 'rb') as f:
+#        data = pickle.load(f)
+#except Exception as E:
+#    print(E)
+#    data = {
+#        'snapshots': [],
+#        'files': []
+#    }
+
+# Bind `data` to an empty database when the first command-line argument is
+# the literal string CLEAR
+if len(sys.argv) > 1 and sys.argv[1] == 'CLEAR':
+    data = {
+        'snapshots': [],
+        'files': []
+    }
+
+# Serialize the in-memory database (`data`) to the file at `path` using
+# pickle, replacing whatever the file held before; pickle is not very
+# efficient, but it can store arbitrary Python objects
+def save(path=dbpath):
+    with open(path, 'wb') as dbfile:
+        pickle.dump(data, dbfile)
+
+# Logging helper function; prints the string `content` wrapped to at most 60
+# characters per line (not counting indentation). Every line is indented by
+# two spaces per `level`, and continuation lines are additionally marked
+# with "~ "
+def log(content, level=0):
+    width = 60
+    prefix = '  ' * level
+    # An empty `content` wraps to no lines at all, in which case only the
+    # prefix is printed
+    print(prefix + f'\n{prefix}~ '.join(textwrap.wrap(content, width)))
+
+
+# Helper class to make my life easier;
+# - supports arbitrary keyword arguments (which are stored as attributes)
+# - records its creation time (time.time(), seconds since the epoch) in the
+#   `created` attribute
+# The original notes also list access/modification tracking and method
+# chaining for iterable operations; neither is implemented in this class yet
+class node:
+    def __init__(self, *args, **kwargs):
+        # Positional arguments are accepted but ignored.
+        # The timestamp is set first so that an explicit `created=...` keyword
+        # argument takes precedence over it
+        self.created = time.time()
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+
+
+# Build a scratch directory of symlinks pointing at the given file nodes and
+# open it with xdg-open. Reads each node's `name` and `path` (and `ext` when
+# a node has to be renamed)
+def openfiles(files):
+    target = f'/home/alex/Desktop/ap-temp-{time.time()}'
+    # Work on deep copies so that renaming never touches the caller's nodes
+    clones = copy.deepcopy(files)
+    # Nodes sharing a name would collide inside one directory, so every member
+    # of such a group gets a numeric suffix (-1, -2, ...) before the extension
+    for current in clones:
+        twins = [other for other in clones if current.name == other.name]
+        if len(twins) > 1:
+            # `twins` holds references into `clones`, so the clones themselves
+            # are renamed here
+            for position, twin in enumerate(twins, start=1):
+                twin.name = str(Path(twin.name).stem + f'-{position}' + twin.ext)
+
+    Path(target).mkdir()
+    # Create one symlink per (possibly renamed) clone, then open the folder
+    for clone in clones:
+        (Path(target) / clone.name).symlink_to(clone.path)
+    os.system(f'xdg-open {target}')
+
+# Add `tag` to `fnode` if its extension matches any of the whitespace-
+# separated types in `types`. Nodes without an `ext` attribute are left
+# alone; a matching node that has no `tags` list yet is given one
+def tagfile(fnode, types, tag):
+    if not hasattr(fnode, 'ext'):
+        return
+    # `[1:]` drops the first character of the extension (presumably the
+    # leading dot) before the case-insensitive comparison
+    if fnode.ext.lower()[1:] not in types.split():
+        return
+    # Older nodes may predate the `tags` attribute
+    if not hasattr(fnode, 'tags'):
+        fnode.tags = []
+    # Apply the requested tag (previously "image" was used regardless of
+    # `tag`) and never add it twice
+    if tag not in fnode.tags:
+        fnode.tags.append(tag)
+
+# Apply some simple tagging rules to (up to `n`) file nodes, printing each
+# one, then save the database.
+# NOTE(review): with the default n=0, islice yields nothing, so no node is
+# processed -- confirm whether 0 was meant to mean "all files"
+def tagfiles(n=0):
+    log('Tagging files')
+    for anode in itertools.islice(data['files'], n):
+        anode.validate()
+        # temporary-ish code to upgrade old file node instances; this has to
+        # happen before the rules run because tagging works on `anode.tags`
+        if not hasattr(anode, 'tags'):
+            anode.tags = []
+        # Rules (i.e., types -> tag)
+        tagfile(anode, 'gif png jpg jpeg tiff webm', 'image')
+        tagfile(anode, 'txt js py sh java css html todo', 'textlike')
+        anode.print()
+    save()
+
+# Return the attribute `attr` of `obj`, or None when `obj` has no such
+# attribute
+def mayhave(obj, attr):
+    return getattr(obj, attr, None)
+
+# Apply OCR to (up to `n`) files tagged with "image" that are not yet marked
+# as processed, store the resulting text in the filenode's `text` attribute,
+# and save the database afterwards.
+# NOTE(review): with the default n=0, islice yields nothing, so no file is
+# processed -- confirm whether 0 was meant to mean "all files"
+def extracttext(n=0):
+    log('Running OCR (Tesseract)')
+    pending = (
+        x for x in data['files']
+        if hasattr(x, 'tags') and 'image' in x.tags and not mayhave(x, 'processed')
+    )
+    for anode in itertools.islice(pending, n):
+        try:
+            log('', 1)
+            log(anode.path, 1)
+            # The context manager closes the image file once OCR is done
+            with Image.open(anode.path) as img:
+                imgcontent = pytesseract.image_to_string(img)
+            anode.text = imgcontent
+
+            # Display a slightly more readable version of the extracted
+            # content: every run of newlines/spaces becomes one space. (The
+            # pattern needs no flags; re.M used to be passed positionally,
+            # where re.sub interprets it as `count`, and newlines used to be
+            # deleted beforehand, which glued words on adjacent lines.)
+            text = re.sub(r'[\n ]+', ' ', imgcontent)
+            log(f"Result (condensed): {text}", 1)
+        # Catch occasional invalid images
+        except UnidentifiedImageError as exception:
+            print(exception)
+        # Marked even when the image was invalid, so the node is skipped by
+        # the filter above on later runs
+        anode.processed = True
+    # Update database file
+    save()
+
+# Load the database unless the CLEAR branch near the top of the script has
+# already bound `data` to an empty one (an unconditional load here would
+# overwrite that empty database).
+# NOTE: pickle can execute arbitrary code while loading; only point `dbpath`
+# at files this tool wrote itself
+clear_requested = len(sys.argv) > 1 and sys.argv[1] == 'CLEAR'
+if not clear_requested:
+    with open(dbpath, 'rb') as f:
+        data = pickle.load(f)
+
+# CLEAR is not a registered subcommand, so it is hidden from argparse;
+# passing None makes parse_args fall back to sys.argv[1:]
+args = parser.parse_args(sys.argv[2:] if clear_requested else None)
+#match args.subcommand:
+#    'imf'
+
+# Subparsers attach their handler as `func`; without a subcommand there is
+# nothing to dispatch
+if hasattr(args, 'func'):
+    args.func(args)