commit c7285532d22c39c7b5aac0ebf9253845598ee8c2
Author: Alexander Mahr <info@alexmahr.de>
Date:   Sun Feb 2 18:37:23 2025 +0100

    start repo for account-files

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..43b0c33
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+/source/main.setup.db.sqlite.sql.h
+sources.sqlite3.db
+/result/account-files
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..17687f2
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,7 @@
+FROM debian:latest
+# update and install share one layer, so a cached, outdated package index is never used
+RUN apt-get update -y && apt-get install -y \
+      build-essential pkg-config libxxhash-dev libsqlite3-dev zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR "/source"
+CMD ["make"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9fa80b7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,7 @@
+# account-files - a static binary that combines `find`, `xxh128sum` and `sqlite`
+
+## build instructions
+
+```
+./build.sh
+```
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..2bae88c
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+TAG(){
+  for CMD in xxh128sum sha1sum md5sum 'stat -c %Y'
+  do
+    RESULT="$($CMD Dockerfile 2>/dev/null)" && break
+  done
+  echo ${RESULT:0:16}
+}
+
+# run from the script's directory: Dockerfile, ./source and ./result are relative to it
+cd "$(dirname "$(realpath "$0")")" || exit 1
+IMAGENAME="build-sh--$(basename "$PWD"):$(TAG)"
+
+test -n "$(docker images -q "$IMAGENAME")" || docker build --tag "$IMAGENAME" . || exit 1
+docker run --rm -it -v "$PWD/source:/source" -v "$PWD/result:/result" "$IMAGENAME" "$@"
diff --git a/source/Makefile b/source/Makefile
new file mode 100644
index 0000000..16d6fbe
--- /dev/null
+++ b/source/Makefile
@@ -0,0 +1,33 @@
+
+# The generated header is a direct prerequisite (no need to touch the .c file);
+# the output directory is order-only so that its mtime never forces a relink.
+../result/account-files: account-files.c main.setup.db.sqlite.sql.h | ../result
+	gcc -static "$<"  $$(pkg-config --static --cflags --libs libxxhash sqlite3)  -o "$@"
+
+# -p: succeed if the directory already exists
+../result:
+	mkdir -p "$@"
+
+
+
+# ALEXTODO: could be improved by keeping bytes like 'a-zA-Z0-9' etc. however would generate
+#           more complex code here. The total size increase is about 1k and acceptable
+# the header produced here stores the string escaped via "\xXX\xXX...." for each input byte
+# (od -v: without it runs of identical lines become "*"); SQL comment lines are stripped to save space
+main.setup.db.sqlite.sql.h: main.setup.db.sqlite.sql
+	printf '#define MAIN_SETUP_DB_SQLITE_SQL "%s"\n' "$$(grep -ve '^-- ' "$<" | od -v -A n -t x1 | tr -d '\n' | sed 's/ /\\x/g')" > "$@"
+
+## this was the previous attempt, yet `printf '%q'` does not produce likely working output....
+## frankly when it comes to escaping there is no "standard" and all  bash, c, js, python all 
+## have their own rules... 
+##	printf '#define MAIN_SETUP_DB_SQLITE_SQL "%q"\n' "$$(grep -ve '^-- ' "$<" )" >"$@"
+#
+#convert.hierarchical.files.table.render.absolute.pathnames.by.hash: convert.hierarchical.files.table.render.absolute.pathnames.by.hash.sql
+#	time cat "$<" | sqlite3 sources.sqlite3.db | pv -l > "$@"
+#
+#convert.hierarchical.files.table.render.absolute.pathnames: convert.hierarchical.files.table.render.absolute.pathnames.sql
+#	time cat "$<" | sqlite3 sources.sqlite3.db | pv -l > "$@"
+#
+#convert.hierarchical.sources.table.render.absolute.pathnames:
+#	time cat convert.hierarchical.sources.table.render.absolute.pathnames.sql | sqlite3 sources.sqlite3.db | pv -l > convert.hierarchical.sources.table.render.absolute.pathnames
+	
diff --git a/source/account-files.c b/source/account-files.c
new file mode 100644
index 0000000..65411bd
--- /dev/null
+++ b/source/account-files.c
@@ -0,0 +1,376 @@
+#include <errno.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+// strlen
+#include <string.h>
+// malloc
+#include <stdlib.h>
+#include <time.h>
+#include <string.h>
+#include "sqlite3.h"
+#include "xxhash.h"
+#include "main.setup.db.sqlite.sql.h"
+
+
+// needed improvements
+// 1. add more metadata file infos (i.e. file ownership and permissions to the database)
+//    (including dev, and inodes) in a sensible way
+// 2. 
+
+
+time_t flush_last=0; // time of the last periodic transaction flush done by list()
+sqlite3 * db; // database connection, opened by sqlite_my_start()
+sqlite3_stmt * stmt; // NOTE(review): not referenced by any code visible in this file
+sqlite3_stmt * stmt_insert_source; // INSERT OR IGNORE INTO sources (parent_id, name, timestamp)
+sqlite3_stmt * stmt_insert_file; // INSERT OR IGNORE INTO files (source_id, name, size, mtime, timestamp)
+sqlite3_stmt * stmt_select_source_id; // SELECT id FROM sources WHERE parent_id = ? AND name = ?
+sqlite3_stmt * stmt_select_file_id; // SELECT id FROM files WHERE source_id = ? AND name = ? AND timestamp = ?
+sqlite3_stmt * stmt_update_file_hash; // UPDATE files SET hash = ? WHERE id = ?
+sqlite_int64 timestamp; // start time of this run (set in main), stored with every inserted row
+
+// only needed if we have more than one SQL command (i.e what is separated by ";" in 
+// in an input to sqlite3_prepare... function in which case the remaining stuff
+// is stored in the "tail"
+//const char * tail = 0;
+
+// Hash all bytes readable from fd (XXH3-128); all-zero hash if allocation or read() fails.
+XXH128_hash_t hashFile(int fd)
+{
+    XXH128_hash_t result = {0};
+    char buffer[4096];
+    ssize_t count; // signed on purpose: read() reports errors as -1
+    XXH3_state_t* state = XXH3_createState(); // do not just use malloc() or new
+    if (state == NULL) {
+        return result;
+    }
+    XXH3_128bits_reset(state);
+    while ((count = read(fd, buffer, sizeof(buffer))) > 0) {
+        XXH3_128bits_update(state, buffer, (size_t)count);
+    }
+    if (count == 0) {
+        result = XXH3_128bits_digest(state); // only a clean EOF yields a real hash
+    }
+    XXH3_freeState(state); // do not use free()
+    return result;
+}
+
+// at present this function encompasses
+// 1. the opening of the sqlite database 
+// 2. (in case "not exists") the creation of databases and indexes
+// 3. the starting of a _transaction_ (for speed, and I reckon -to be tested- also race condition (i.e file locking))
+// 
+int sqlite_my_start(){
+    // Returns 0 on success, otherwise a negative code naming the failed step:
+    // -1 open db, -2 fstat, -3 malloc, -4 read, -5 setup SQL, -6 BEGIN,
+    // -7..-11 preparing the five statements in the order they appear below.
+    if(SQLITE_OK != sqlite3_open("sources.sqlite3.db", &db)) {
+        return -1;
+    }
+
+    // A main.setup.db.sqlite.sql in the working directory takes precedence over
+    // the copy compiled in as MAIN_SETUP_DB_SQLITE_SQL.
+    char* buffer_sqlite = NULL;
+    int fd_initsql = open("main.setup.db.sqlite.sql",O_RDONLY);
+    if(fd_initsql!=-1){
+        struct stat statbuf;
+        if( -1 == fstat(fd_initsql,&statbuf)){
+            close(fd_initsql);
+            return -2;
+        }
+        size_t size = (size_t) statbuf.st_size;
+        size_t done = 0;
+        // one extra byte: sqlite3_exec() needs a NUL-terminated string
+        buffer_sqlite = malloc(size + 1);
+        if( buffer_sqlite == NULL){
+            close(fd_initsql);
+            return -3;
+        }
+        while(done < size){
+            ssize_t readresult = read(fd_initsql,buffer_sqlite+done,size-done);
+            if(readresult == -1){
+                free(buffer_sqlite);
+                close(fd_initsql);
+                return -4;
+            }
+            if(readresult == 0){
+                break; // file shrank after fstat(): stop instead of looping forever
+            }
+            done += (size_t) readresult;
+        }
+        buffer_sqlite[done] = '\0';
+        close(fd_initsql);
+    }
+    char* errmsg = NULL;
+    int sqlresult = sqlite3_exec(db,buffer_sqlite == NULL ? MAIN_SETUP_DB_SQLITE_SQL : buffer_sqlite,
+        NULL,NULL,&errmsg);
+    free(buffer_sqlite); // free(NULL) is a no-op
+    if(SQLITE_OK != sqlresult){
+        fprintf(stderr,"setup SQL failed: %s\n",errmsg != NULL ? errmsg : "unknown error");
+        sqlite3_free(errmsg);
+        return -5;
+    }
+    if(SQLITE_OK != sqlite3_exec(db, "BEGIN TRANSACTION", NULL, NULL, NULL)){
+        return -6;
+    }
+    if(SQLITE_OK != sqlite3_prepare_v2(db, "INSERT OR IGNORE INTO sources (parent_id, "
+        "name,  timestamp) VALUES (?,?,?)", -1, &stmt_insert_source, NULL)) {
+        return -7;
+    }
+    if(SQLITE_OK != sqlite3_prepare_v2(db, "INSERT OR IGNORE INTO files (source_id, "
+        "name, size, mtime, timestamp) VALUES (?,?,?,?,?)", -1, &stmt_insert_file, NULL)) {
+        return -8;
+    }
+    if(SQLITE_OK != sqlite3_prepare_v2(db, "SELECT id FROM sources WHERE parent_id = ? AND name = ?",
+         -1, &stmt_select_source_id, NULL)) {
+        return -9;
+    }
+    if(SQLITE_OK != sqlite3_prepare_v2(db, "SELECT id FROM files WHERE source_id = ? AND name = ? AND timestamp = ? ",
+         -1, &stmt_select_file_id, NULL)) {
+        return -10;
+    }
+    if(SQLITE_OK != sqlite3_prepare_v2(db, "UPDATE files SET hash = ? WHERE id = ?",
+         -1, &stmt_update_file_hash, NULL)) {
+        return -11;
+    }
+    return 0;
+}
+
+int sqlite_my_end(){
+    // Returns 0, or -1 if the commit or the close failed. Statements are finalized
+    // first because sqlite3_close() fails with SQLITE_BUSY while any still exist.
+    sqlite3_stmt * stmts[] = { stmt_insert_source, stmt_insert_file, stmt_select_source_id,
+                               stmt_select_file_id, stmt_update_file_hash };
+    int result = (SQLITE_OK == sqlite3_exec(db, "END TRANSACTION", NULL, NULL, NULL)) ? 0 : -1;
+    for(size_t i = 0; i < sizeof(stmts)/sizeof(stmts[0]); i++) sqlite3_finalize(stmts[i]);
+    return (SQLITE_OK == sqlite3_close(db)) ? result : -1;
+}
+
+// Clears the bindings of `s` and resets it, which is required before the
+// statement can be bound and stepped again.
+static void list_stmt_done(sqlite3_stmt * s){
+    sqlite3_clear_bindings(s);
+    sqlite3_reset(s);
+}
+
+// Ends the current transaction and begins a new one if the last flush happened
+// more than 10 seconds ago.
+static void list_flush_if_due(void){
+    if(time(NULL)-flush_last > 10){
+        flush_last=time(NULL);
+        sqlite3_exec(db, "END TRANSACTION", NULL, NULL, NULL);
+        fprintf(stderr, " NEW TA\n");
+        sqlite3_exec(db, "BEGIN TRANSACTION", NULL, NULL, NULL);
+    }
+}
+
+// Makes sure the directory `name` below `parent_id` has a row in the sources
+// table; the statement is an INSERT OR IGNORE, so an existing row is kept.
+static void list_insert_dir(sqlite3_int64 parent_id, const char* name){
+    sqlite3_bind_int64(stmt_insert_source, 1, parent_id);
+    sqlite3_bind_text(stmt_insert_source, 2, name, -1, SQLITE_TRANSIENT);
+    sqlite3_bind_int64(stmt_insert_source, 3,(sqlite3_int64) timestamp);
+    sqlite3_step(stmt_insert_source);
+    list_stmt_done(stmt_insert_source);
+}
+
+// Looks up the sources row of the directory `name` below `parent_id`.
+//   parent_id  id of the parent directory's sources row
+//   name       name of the directory (a single path component)
+//   id         receives the row id on success, is left untouched otherwise
+// Returns 0 on success; prints a message and returns -8 if the query did not
+// yield a row.
+static int list_lookup_dir(sqlite3_int64 parent_id, const char* name, sqlite3_int64* id){
+    sqlite3_bind_int64(stmt_select_source_id, 1, parent_id);
+    sqlite3_bind_text(stmt_select_source_id, 2, name, -1, SQLITE_TRANSIENT);
+    int sqlite_result = sqlite3_step(stmt_select_source_id);
+    if(SQLITE_ROW == sqlite_result){
+        *id = sqlite3_column_int64(stmt_select_source_id, 0);
+    } else {
+        fprintf(stderr,"error getting source_id %d %s\n",sqlite_result,name);
+    }
+    list_stmt_done(stmt_select_source_id);
+    return SQLITE_ROW == sqlite_result ? 0 : -8;
+}
+
+// Records one regular file in the files table and, if that created a row for
+// this run, hashes the file content and stores the hash in that row.
+//   dirfd      descriptor of the directory containing the file
+//   name       name of the file inside that directory
+//   st         result of fstatat() for the file (size and mtime are stored)
+//   source_id  id of the containing directory's sources row
+// Returns 0 on success, which includes files that were already recorded and
+// files that could not be opened for hashing; -9 if the row lookup failed.
+static int list_record_file(int dirfd, const char* name, const struct stat* st, sqlite3_int64 source_id){
+    sqlite3_bind_int64(stmt_insert_file, 1, source_id);
+    sqlite3_bind_text(stmt_insert_file,  2, name, -1, SQLITE_TRANSIENT);
+    sqlite3_bind_int64(stmt_insert_file, 3,(sqlite3_int64) st->st_size);
+    sqlite3_bind_int64(stmt_insert_file, 4,(sqlite3_int64) st->st_mtim.tv_sec);
+    sqlite3_bind_int64(stmt_insert_file, 5,(sqlite3_int64) timestamp);
+    sqlite3_step(stmt_insert_file);
+    list_stmt_done(stmt_insert_file);
+
+    // Only a row inserted during this run carries the current timestamp, so this
+    // query tells whether the insert above took effect.
+    sqlite3_bind_int64(stmt_select_file_id, 1, source_id);
+    sqlite3_bind_text(stmt_select_file_id, 2, name, -1, SQLITE_TRANSIENT);
+    sqlite3_bind_int64(stmt_select_file_id, 3,(sqlite3_int64) timestamp);
+    int sqlite_result = sqlite3_step(stmt_select_file_id);
+    sqlite3_int64 file_id = -1;
+    if(SQLITE_ROW == sqlite_result){
+        file_id = sqlite3_column_int64(stmt_select_file_id, 0);
+    }
+    // Reset on every outcome: binding a statement that was stepped but not reset
+    // fails with SQLITE_MISUSE, so the next file would be looked up with the
+    // values of this one.
+    list_stmt_done(stmt_select_file_id);
+
+    if(SQLITE_DONE == sqlite_result){
+        // no insertion -> assumption is no change -> no hash
+        return 0;
+    }
+    if(SQLITE_ROW != sqlite_result){
+        // general error
+        return -9;
+    }
+
+    // insertion merits hashing
+    int filefd = openat(dirfd,name,O_RDONLY);
+    if(filefd < 0){
+        perror("open");
+        //ALEXTODO: handle more gracefully
+        return 0;
+    }
+    XXH128_hash_t hash = hashFile(filefd);
+    close(filefd);
+
+    sqlite3_bind_blob(stmt_update_file_hash,1,(const void*)&hash,sizeof(XXH128_hash_t),SQLITE_TRANSIENT);
+    sqlite3_bind_int64(stmt_update_file_hash, 2, file_id);
+    sqlite3_step(stmt_update_file_hash);
+    list_stmt_done(stmt_update_file_hash);
+    return 0;
+}
+
+// Records every entry of the directory open at `dirfd` and descends into its
+// subdirectories.
+//   dirfd      descriptor of the directory; list() takes ownership and closes it
+//   curdir     path of that directory, used for the printed path names only
+//   length     size of curdir in bytes, including the terminating NUL
+//   source_id  id of the directory's row in the sources table
+// Returns 0 on success, 1 if the directory could not be opened or read, 2 if
+// an entry could not be stat'ed and -8/-9 if a database lookup failed.
+// Results of the recursive calls for subdirectories are not propagated.
+int list(int dirfd, char* curdir, int length, sqlite3_int64 source_id){
+    list_flush_if_due();
+
+    DIR* pdir = fdopendir(dirfd);
+    if(pdir==NULL){
+        perror("fehler fopendir");
+        close(dirfd);
+        return 1;
+    }
+
+    int result = 0;
+    struct stat statbuf;
+    while(1)
+    {
+        errno = 0; // readdir() returns NULL at the end and on errors; errno tells them apart
+        struct dirent * pdirent = readdir(pdir);
+        if(pdirent==NULL){
+            if(errno!=0){
+                fprintf(stderr," readdir errno = %d\n",errno);
+                result = 1;
+            }
+            break;
+        }
+        const char* name = pdirent->d_name;
+        if(0==strcmp(name,".") || 0==strcmp(name,"..")){
+            continue; // ignore SELF and PARENT
+        }
+
+        // ALEXTODO: determine if openat + fstat is the better option than fstatat here
+        // reason: the fstat-at already works (within a folder - using fd file discriptor)
+        // reason: not all opened files need to be opened maybe...
+        if( 0 != fstatat(dirfd,name,&statbuf,AT_SYMLINK_NOFOLLOW) ){
+            fprintf(stderr,"fstatat errno = %d\n",errno);
+            result = 2;
+            break;
+        }
+        printf("%s/%s\n",curdir,name);
+
+        if(S_ISREG(statbuf.st_mode)){
+            result = list_record_file(dirfd, name, &statbuf, source_id);
+            if(0 != result){
+                break;
+            }
+            continue;
+        }
+        if(!S_ISDIR(statbuf.st_mode)){
+            // anything that is neither a regular file nor a directory is only printed
+            continue;
+        }
+
+        list_insert_dir(source_id, name);
+        int subdirfd = openat(dirfd,name,O_RDONLY);
+        if(-1 == subdirfd){
+            perror("subdirfd");
+            continue;
+        }
+        sqlite3_int64 next_source_id = -1;
+        result = list_lookup_dir(source_id, name, &next_source_id);
+        if(0 != result){
+            close(subdirfd);
+            break;
+        }
+
+        // length already counts the NUL of curdir; the extra byte is for the '/'
+        int next_length = length+1+(int)strlen(name);
+        char* next_curdir = malloc((size_t) next_length);
+        if(next_curdir==NULL){
+            perror("malloc");
+            close(subdirfd);
+            continue;
+        }
+        snprintf(next_curdir,(size_t) next_length,"%s/%s",curdir,name);
+        // The callee owns subdirfd from here on (its closedir() closes it), so it
+        // must not be closed here again. Its result is not propagated, so that
+        // a problem in one subtree does not end the scan of the sibling entries.
+        list(subdirfd,next_curdir,next_length,next_source_id);
+        free(next_curdir);
+    }
+    closedir(pdir); // also closes dirfd
+    return result;
+}
+
+
+int main(int argc, char* argv[]){
+    // Scans the tree below "." into sources.sqlite3.db. Returns 0 on success,
+    // -1 if database setup failed, -2 if commit/close failed, -3 if the scan failed.
+    char* curdir= ".";
+    int scan_result = 1;
+    timestamp = (sqlite_int64) time(NULL);
+    int result = sqlite_my_start();
+    if(0!=result){
+        fprintf(stderr,"error sqlite_my_start() %d\n",result);
+        return -1;
+    }
+    puts(".");
+    int dirfd = openat(AT_FDCWD,".",O_RDONLY);
+    if (dirfd > -1){
+        scan_result = list(dirfd,curdir,1+strlen(curdir),0);
+    } else {
+        perror("open .");
+    }
+    // end the transaction even after a failed scan so that partial results are kept
+    result = sqlite_my_end();
+    if(0!=result){
+        fprintf(stderr,"error sqlite_my_end() %d\n",result);
+        return -2;
+    }
+    return 0==scan_result ? 0 : -3;
+}
+
+
diff --git a/source/main.setup.db.sqlite.sql b/source/main.setup.db.sqlite.sql
new file mode 100644
index 0000000..a111e8e
--- /dev/null
+++ b/source/main.setup.db.sqlite.sql
@@ -0,0 +1,49 @@
+-- main.setup.db.sqlite.sql:
+--  file that contains the sqlite SQL used to setup the database.
+--  idea is to have the sqlite SQL in a separate ideally small file to
+--   a) be able to tweak/see SQL database setup without necessarily force recompilation
+--   b) change for syntax highlighting...
+--  tradeoff challenges assumed are issues with:
+--   a) split of individual SQL commands and
+--   b) handling of comments
+CREATE TABLE IF NOT EXISTS sources (
+    id INTEGER PRIMARY KEY,
+    parent_id INTEGER REFERENCES sources(id),
+    name TEXT,
+    timestamp INTEGER,
+    UNIQUE (parent_id,name) ON CONFLICT IGNORE
+);
+CREATE INDEX IF NOT EXISTS sources_index__timestamp ON sources(timestamp);
+CREATE INDEX IF NOT EXISTS sources_index__parent_id ON sources(parent_id);
+CREATE INDEX IF NOT EXISTS sources_index__name ON sources(name);
+-- for stmt_select_source_id
+CREATE INDEX IF NOT EXISTS sources_index__parent_id__name ON sources(parent_id,name);
+--CREATE INDEX IF NOT EXISTS sources_index__parent_id__name__timestamp ON sources(parent_id,name,timestamp);
+-- root row: the scan is started with source_id 0 for the scanned directory
+INSERT OR IGNORE INTO sources (id,parent_id,name,timestamp) VALUES (0, 0, '/', 0);
+--  files
+CREATE TABLE IF NOT EXISTS files(
+    id INTEGER PRIMARY KEY,
+    source_id INTEGER REFERENCES sources(id),
+    name TEXT,
+    size INTEGER,
+    mtime INTEGER,
+    timestamp INTEGER,
+    hash BLOB,
+    UNIQUE (source_id,name,size,mtime) ON CONFLICT IGNORE
+);
+CREATE INDEX IF NOT EXISTS files_index__timestamp ON files(timestamp);
+CREATE INDEX IF NOT EXISTS files_index__hash ON files(hash);
+-- for stmt_select_file_id 
+CREATE INDEX IF NOT EXISTS files_index__source_id__name__timestamp ON files(source_id,name,timestamp);
+CREATE INDEX IF NOT EXISTS files_index__source_id__name__size__mtime ON files(source_id,name,size,mtime);
+
+-- hashes
+-- CREATE TABLE IF NOT EXISTS hashes(
+--     id INTEGER PRIMARY KEY,
+--     hash BLOB,
+--     timestamp INTEGER, 
+--     UNIQUE (hash) ON CONFLICT IGNORE
+-- );
+-- CREATE INDEX IF NOT EXISTS hashes_index__timestamp ON hashes(timestamp);
+-- CREATE INDEX IF NOT EXISTS hashes_index__hash ON hashes(hash);