commit c7285532d22c39c7b5aac0ebf9253845598ee8c2 Author: Alexander Mahr Date: Sun Feb 2 18:37:23 2025 +0100 start repo for account-files diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..43b0c33 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/source/main.setup.db.sqlite.sql.h +sources.sqlite3.db +/result/account-files diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..17687f2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,7 @@ +FROM debian:latest +RUN apt-get update -y && apt-get install -y build-essential +RUN apt-get install -y libxxhash-dev pkg-config libsqlite3-dev +RUN apt-get install -y zlib1g-dev +CMD ["make"] +WORKDIR "/source" + diff --git a/README.md b/README.md new file mode 100644 index 0000000..9fa80b7 --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +# account-files - static binary that combines `find`,`xxh128sum` and `sqlite` + +## build instructions + +``` +./build.sh +``` diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..2bae88c --- /dev/null +++ b/build.sh @@ -0,0 +1,16 @@ +#!/bin/sh + +TAG(){ + for CMD in xxh128sum sha1sum md5sum 'stat -c %Y' + do + RESULT="$($CMD Dockerfile 2>/dev/null)" && break + done + echo ${RESULT:0:16} +} + +IMAGENAME="build-sh--$(basename "$(dirname "$(realpath "$0")")")":"$(TAG)" + +test -z $(docker images -q "$IMAGENAME") && { +docker build --tag "$IMAGENAME" . +} +docker run --rm -it -v ./source:/source -v ./result:/result "$IMAGENAME" "$@" diff --git a/source/Makefile b/source/Makefile new file mode 100644 index 0000000..16d6fbe --- /dev/null +++ b/source/Makefile @@ -0,0 +1,33 @@ + +../result/account-files: account-files.c ../result + gcc -static "$<" $$(pkg-config --static --cflags --libs libxxhash sqlite3) -o "$@" + +../result: + mkdir ../result + +account-files.c: main.setup.db.sqlite.sql.h + touch "$@" + + + +# ALEXTODO: could be improved by keeping bytes like 'a-zA-Z0-9' etc. however would generate +# more complex code here. 
The total size increase is about 1k and acceptable +# the header produced here stores the string escaped via "\xXX\xXX...." for each input byte +# it strips however SQL comments to save some space +main.setup.db.sqlite.sql.h: main.setup.db.sqlite.sql + echo '#define MAIN_SETUP_DB_SQLITE_SQL "'"$$(grep -ve '^-- ' "$<" | od -A n -t x1 | tr -d '\n' | sed 's/ /\\x/g')"'"' > "$@" + +## this was the previous attempt, yet `printf '%q'` does not produce likely working output.... +## frankly when it comes to escaping there is no "standard" and all bash, c, js, python all +## have their own rules... +## printf '#define MAIN_SETUP_DB_SQLITE_SQL "%q"\n' "$$(grep -ve '^-- ' "$<" )" >"$@" +# +#convert.hierarchical.files.table.render.absolute.pathnames.by.hash: convert.hierarchical.files.table.render.absolute.pathnames.by.hash.sql +# time cat "$<" | sqlite3 sources.sqlite3.db | pv -l > "$@" +# +#convert.hierarchical.files.table.render.absolute.pathnames: convert.hierarchical.files.table.render.absolute.pathnames.sql +# time cat "$<" | sqlite3 sources.sqlite3.db | pv -l > "$@" +# +#convert.hierarchical.sources.table.render.absolute.pathnames: +# time cat convert.hierarchical.sources.table.render.absolute.pathnames.sql | sqlite3 sources.sqlite3.db | pv -l > convert.hierarchical.sources.table.render.absolute.pathnames + diff --git a/source/account-files.c b/source/account-files.c new file mode 100644 index 0000000..65411bd --- /dev/null +++ b/source/account-files.c @@ -0,0 +1,376 @@ +#include +#include +#include +#include +#include +#include +#include +// strlen +#include +// malloc +#include +#include +#include +#include "sqlite3.h" +#include "xxhash.h" +#include "main.setup.db.sqlite.sql.h" + + +// needed improvements +// 1. add more metadata file infos (i.e. file ownership and permissions to the database) +// (including dev, and inodes) in a sensible way +// 2. 
+ + +time_t flush_last=0; +sqlite3 * db; +sqlite3_stmt * stmt; +sqlite3_stmt * stmt_insert_source; +sqlite3_stmt * stmt_insert_file; +sqlite3_stmt * stmt_select_source_id; +sqlite3_stmt * stmt_select_file_id; +sqlite3_stmt * stmt_update_file_hash; +sqlite_int64 timestamp; + +// only needed if we have more than one SQL command (i.e what is separated by ";" in +// in an input to sqlite3_prepare... function in which case the remaining stuff +// is stored in the "tail" +//const char * tail = 0; + +XXH128_hash_t hashFile(int fd) +{ + // Allocate a state struct. Do not just use malloc() or new. + XXH3_state_t* state = XXH3_createState(); + // Reset the state to start a new hashing session. + XXH3_128bits_reset(state); + char buffer[4096]; + size_t count; + // Read the file in chunks + while ((count = read(fd,buffer,sizeof(buffer))) != 0) { + // Run update() as many times as necessary to process the data + XXH3_128bits_update(state, buffer, count); + } + // Retrieve the finalized hash. This will not change the state. + XXH128_hash_t result = XXH3_128bits_digest(state); + // Free the state. Do not use free(). + XXH3_freeState(state); + return result; +} + + +// at present this function encompasses +// 1. the opening of the sqlite database +// 2. (in case "not exists") the creation of databases and indexes +// 3. 
the starting of a _transaction_ (for speed, and I reckon -to be tested- also race condition (i.e file locking)) +// +int sqlite_my_start(){ + + + if(SQLITE_OK != sqlite3_open("sources.sqlite3.db", &db)) { + return -1; + } + + int fd_initsql; + struct stat statbuf; + char* buffer_sqlite = NULL; + ssize_t readresult = 0; + size_t toread; + char* errmsg = NULL; + int sqlresult; + fd_initsql = open("main.setup.db.sqlite.sql",O_RDONLY); + if(fd_initsql!=-1){ + if( -1 == fstat(fd_initsql,&statbuf)){ + close(fd_initsql); + return -2; + } + buffer_sqlite = malloc(statbuf.st_size); + toread = statbuf.st_size; + if( buffer_sqlite == NULL) + { + close(fd_initsql); + return -3; + } + while(toread > 0){ + readresult = read(fd_initsql,buffer_sqlite+(statbuf.st_size-toread),toread); + if(readresult == -1){ + free(buffer_sqlite); + close(fd_initsql); + return -4; + } + toread -= readresult; + } + close(fd_initsql); + } + if(SQLITE_OK != sqlite3_exec(db,buffer_sqlite == NULL ? MAIN_SETUP_DB_SQLITE_SQL : buffer_sqlite, + NULL,NULL,&errmsg)){ + return -5; + } + if(buffer_sqlite!=NULL) + { + free(buffer_sqlite); + } +// +// +// // those execs appear to return SQLITE_OK (0) value. probably should hence be checked against that + if(SQLITE_OK != sqlite3_exec(db, "BEGIN TRANSACTION", NULL, NULL, NULL)){ + return -6; + } + if(SQLITE_OK != sqlite3_prepare_v2(db, "INSERT OR IGNORE INTO sources (parent_id, " + "name, timestamp) VALUES (?,?,?)", -1, &stmt_insert_source, NULL)) { + return -7; + } + if(SQLITE_OK != sqlite3_prepare_v2(db, "INSERT OR IGNORE INTO files (source_id, " + "name, size, mtime, timestamp) VALUES (?,?,?,?,?)", -1, &stmt_insert_file, NULL)) { + return -8; + } + if(SQLITE_OK != sqlite3_prepare_v2(db, "SELECT id FROM sources WHERE parent_id = ? AND name = ?", + -1, &stmt_select_source_id, NULL)) { + return -9; + } + if(SQLITE_OK != sqlite3_prepare_v2(db, "SELECT id FROM files WHERE source_id = ? AND name = ? AND timestamp = ? 
", + -1, &stmt_select_file_id, NULL)) { + return -10; + } + if(SQLITE_OK != sqlite3_prepare_v2(db, "UPDATE files SET hash = ? WHERE id = ?", + -1, &stmt_update_file_hash, NULL)) { + return -11; + } + + return 0; +} + +int sqlite_my_end(){ + sqlite3_exec(db, "END TRANSACTION", NULL, NULL, NULL); + sqlite3_close(db); + return 0; +} + + + + + +int list(int dirfd, char* curdir, int length, sqlite3_int64 source_id){ + if(time(NULL)-flush_last > 10){ + flush_last=time(NULL); + sqlite3_exec(db, "END TRANSACTION", NULL, NULL, NULL); + fprintf(stderr, " NEW TA\n"); + sqlite3_exec(db, "BEGIN TRANSACTION", NULL, NULL, NULL); + } + char* next_curdirnext = NULL; + int next_length = 0; + int sqlite_result; + int filefd; + sqlite3_int64 next_source_id = -1; + sqlite3_int64 file_id = -1; +// putchar + + DIR* pdir = fdopendir(dirfd); + if(pdir==NULL){ + perror("fehler fopendir"); + return 1; + } + struct dirent * pdirent = NULL; + struct stat statbuf; + while(1) + { + errno = 0; + pdirent = readdir(pdir); + if(pdirent==NULL){ + if(errno==0) + { +// printf("end reached\n"); + closedir(pdir); + break; + } + else + { + fprintf(stderr," readdir errno = %d\n",errno); + return 1; + } + } + else + { + if(pdirent->d_name[0]=='.'){ + if(pdirent->d_name[1]==0){ + //printf("ignore SELF\n"); + continue; + } + if(pdirent->d_name[1]=='.'){ + if(pdirent->d_name[2]==0){ + // printf("ignore PARENT\n"); + continue; + } + } + } + // ALEXTODO: determine if openat + fstat is the better option than fstatat here + // reason: the fstat-at already works (within a folder - using fd file discriptor) + // reason: not all opened files need to be opened maybe... 
+ if( 0== fstatat(dirfd,pdirent->d_name,&statbuf,AT_SYMLINK_NOFOLLOW) ) + { + printf("%s/%s\n",curdir,pdirent->d_name); + if(S_ISDIR(statbuf.st_mode)){ + //sqlite3_bind_null(stmt_insert, 1); + sqlite3_bind_int64(stmt_insert_source, 1,(sqlite3_int64) source_id);// pdirent->d_ino); + sqlite3_bind_text(stmt_insert_source, 2, pdirent->d_name, -1, SQLITE_TRANSIENT); + sqlite3_bind_int64(stmt_insert_source, 3,(sqlite3_int64) timestamp); + sqlite3_step(stmt_insert_source); + sqlite3_clear_bindings(stmt_insert_source); + sqlite3_reset(stmt_insert_source); + + + // printf("%s\n",pdirent->d_name); + int subdirfd = openat(dirfd,pdirent->d_name,O_RDONLY); + if(-1 == subdirfd) + { + perror("subdirfd"); + } else { + + sqlite3_bind_int64(stmt_select_source_id, 1,(sqlite3_int64) source_id);// pdirent->d_ino); + sqlite3_bind_text(stmt_select_source_id, 2, pdirent->d_name, -1, SQLITE_TRANSIENT); + sqlite_result = sqlite3_step(stmt_select_source_id); + + if(SQLITE_ROW == sqlite_result){ + next_source_id = sqlite3_column_int64(stmt_select_source_id, 0); + } else { + fprintf(stderr,"error getting source_id %d %s\n",sqlite_result,pdirent->d_name); + sqlite3_clear_bindings(stmt_select_source_id); + sqlite3_reset(stmt_select_source_id); + return -8; + } + sqlite3_clear_bindings(stmt_select_source_id); + sqlite3_reset(stmt_select_source_id); + next_length=(length+1+strlen(pdirent->d_name)); + next_curdirnext = malloc(next_length); + if(next_curdirnext==NULL) + { + perror("malloc"); + } + else + { + sprintf(next_curdirnext,"%s/%s",curdir,pdirent->d_name); + list(subdirfd,next_curdirnext,next_length,next_source_id); + free(next_curdirnext); + close(subdirfd); + } + } + } + else if(S_ISREG(statbuf.st_mode)){ + sqlite3_bind_int64(stmt_insert_file, 1,(sqlite3_int64) source_id); + sqlite3_bind_text(stmt_insert_file, 2, pdirent->d_name, -1, SQLITE_TRANSIENT); + sqlite3_bind_int64(stmt_insert_file, 3,(sqlite3_int64) statbuf.st_size); + sqlite3_bind_int64(stmt_insert_file, 4,(sqlite3_int64) 
statbuf.st_mtim.tv_sec); + sqlite3_bind_int64(stmt_insert_file, 5,(sqlite3_int64) timestamp); + sqlite3_step(stmt_insert_file); + sqlite3_clear_bindings(stmt_insert_file); + sqlite3_reset(stmt_insert_file); + + sqlite3_bind_int64(stmt_select_file_id, 1,(sqlite3_int64) source_id); + sqlite3_bind_text(stmt_select_file_id, 2, pdirent->d_name, -1, SQLITE_TRANSIENT); + sqlite3_bind_int64(stmt_select_file_id, 3,(sqlite3_int64) timestamp); + sqlite_result = sqlite3_step(stmt_select_file_id); + if(sqlite_result == SQLITE_ROW){ + // insertion merits hashing + file_id = sqlite3_column_int64(stmt_select_file_id, 0); + sqlite3_clear_bindings(stmt_select_file_id); + sqlite3_reset(stmt_select_file_id); + //fprintf(stderr,"file %s was inserted\n",pdirent->d_name); + filefd = openat(dirfd,pdirent->d_name,O_RDONLY); + if( filefd < 0){ + perror("open"); + + //ALEXTODO: handle more gracefully + continue; + } + XXH128_hash_t hash = hashFile(filefd); + close(filefd); +// fprintf(stderr,"%016llx%016llx %s sizeof=%d\n",(unsigned long long ) hash.high64,(unsigned long long)hash.low64,pdirent->d_name,sizeof(XXH128_hash_t)); + sqlite3_bind_int64(stmt_update_file_hash, 2,(sqlite3_int64) file_id); + sqlite3_bind_blob(stmt_update_file_hash,1,(const void*)&hash,sizeof(XXH128_hash_t),SQLITE_TRANSIENT); + sqlite3_step(stmt_update_file_hash); + sqlite3_clear_bindings(stmt_update_file_hash); + sqlite3_reset(stmt_update_file_hash); + + } else if(sqlite_result == SQLITE_DONE) { + // no insertion -> assumpotion is no change -> no hash + //fprintf(stderr,"file %s EXISTED\n",pdirent->d_name); + } else { + // general error +// fprintf(stderr,"file %s result was %d\n",sqlite_result); + sqlite3_clear_bindings(stmt_select_source_id); + sqlite3_reset(stmt_select_source_id); + return -9; + } +// sqlite3_clear_bindings(stmt_select_file_id); +// sqlite3_reset(stmt_select_file_id); + } + + //else + //{ + //// printf("%s/%s\n",curdir,pdirent->d_name); + //} + //struct stat { + // dev_t st_dev; /* ID of device 
containing file */ + // ino_t st_ino; /* Inode number */ + // mode_t st_mode; /* File type and mode */ + // nlink_t st_nlink; /* Number of hard links */ + // uid_t st_uid; /* User ID of owner */ + // gid_t st_gid; /* Group ID of owner */ + // dev_t st_rdev; /* Device ID (if special file) */ + // off_t st_size; /* Total size, in bytes */ + // blksize_t st_blksize; /* Block size for filesystem I/O */ + // blkcnt_t st_blocks; /* Number of 512 B blocks allocated */ + + // /* Since POSIX.1-2008, this structure supports nanosecond + // precision for the following timestamp fields. + // For the details before POSIX.1-2008, see VERSIONS. */ + + // struct timespec st_atim; /* Time of last access */ + // struct timespec st_mtim; /* Time of last modification */ + // struct timespec st_ctim; /* Time of last status change */ + + //#define st_atime st_atim.tv_sec /* Backward compatibility */ + //#define st_mtime st_mtim.tv_sec + //#define st_ctime st_ctim.tv_sec + //}; + } + else + { + fprintf(stderr,"fstatat errno = %d\n",errno); + return 2; + } + //printf("%d %s\n",pdirent->d_type ,pdirent->d_name); + } + } +// printf("dirfd = %d %p\n",dirfd,pdir); + return 0; +} + + +int main(int argc, char* argv[]){ +// DIR current; + int dirfd; + char* curdir= "."; + int result = 0; + timestamp = (sqlite_uint64) time(NULL); + result = sqlite_my_start(); + if(0!=result) + { + fprintf(stderr,"error sqlite_my_start() %d\n",result); + return -1; + } + puts("."); + dirfd = openat(AT_FDCWD,".",O_RDONLY); + if (dirfd > -1){ + list(dirfd,curdir,1+strlen(curdir),0); + } + result = sqlite_my_end(); + if(0!=result) + { + fprintf(stderr,"error sqlite_my_end() %d\n",result); + return -2; + } + + return 0; +} + + diff --git a/source/main.setup.db.sqlite.sql b/source/main.setup.db.sqlite.sql new file mode 100644 index 0000000..a111e8e --- /dev/null +++ b/source/main.setup.db.sqlite.sql @@ -0,0 +1,49 @@ +-- main.setup.db.sqlite.sql: +-- file that contains the sqlite SQL used to setup the database. 
+-- idea is to have the sqlite SQL in a separate ideally small file to +-- a) be able to tweak/see SQL database setup without necessarily force recompilation +-- b) change for syntax highlighting... +-- tradeoff challenges assumed are issues with: +-- a) split of individual SQL commands and +-- b) handling of comments +CREATE TABLE IF NOT EXISTS sources ( + id INTEGER PRIMARY KEY, + parent_id INTEGER REFERENCES sources(id), + name TEXT, + timestamp INTEGER, + UNIQUE (parent_id,name) ON CONFLICT IGNORE +); +CREATE INDEX IF NOT EXISTS sources_index__timestamp ON sources(timestamp); +CREATE INDEX IF NOT EXISTS sources_index__parent_id ON sources(parent_id); +CREATE INDEX IF NOT EXISTS sources_index__name ON sources(name); +CREATE INDEX IF NOT EXISTS sources_index__parent_id__name ON sources(parent_id,name); +--CREATE INDEX IF NOT EXISTS sources_index__parent_id__name__timestamp ON sources(parent_id,name,timestamp); +CREATE INDEX IF NOT EXISTS sources_index__parent_id__name ON sources(parent_id,name); + +INSERT OR IGNORE INTO sources (id,parent_id,name,timestamp) VALUES (0, 0, '/', 0); +-- files +CREATE TABLE IF NOT EXISTS files( + id INTEGER PRIMARY KEY, + source_id INTEGER REFERENCES sources(id), + name TEXT, + size INTEGER, + mtime INTEGER, + timestamp INTEGER, + hash BLOB, + UNIQUE (source_id,name,size,mtime) ON CONFLICT IGNORE +); +CREATE INDEX IF NOT EXISTS files_index__timestamp ON files(timestamp); +CREATE INDEX IF NOT EXISTS files_index__hash ON files(hash); +-- for stmt_select_file_id +CREATE INDEX IF NOT EXISTS files_index__source_id__name__timestamp ON files(source_id,name,timestamp); +CREATE INDEX IF NOT EXISTS files_index__source_id__name__size__mtime ON files(source_id,name,size,mtime); + +-- hashes +-- CREATE TABLE IF NOT EXISTS hashes( +-- id INTEGER PRIMARY KEY, +-- hash BLOB, +-- timestamp INTEGER, +-- UNIQUE (hash) ON CONFLICT IGNORE +-- ); +-- CREATE INDEX IF NOT EXISTS hashes_index__timestamp ON hashes(timestamp); +-- CREATE INDEX IF NOT EXISTS 
hashes_index__hash ON hashes(hash);