start repo for account-files

This commit is contained in:
Alexander Mahr 2025-02-02 18:37:23 +01:00
commit c7285532d2
7 changed files with 491 additions and 0 deletions

3
.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
/source/main.setup.db.sqlite.sql.h
sources.sqlite3.db
/result/account-files

7
Dockerfile Normal file
View file

@ -0,0 +1,7 @@
# Build image for account-files: a Debian toolchain plus the development
# packages of the libraries that source/Makefile links statically.
FROM debian:latest
# Update and install in ONE layer: a cached "apt-get update" layer combined with
# a later, separately built "apt-get install" layer can reference package
# versions that no longer exist on the mirrors. The apt lists are removed
# afterwards because they are not needed at build time of the project.
RUN apt-get update -y \
 && apt-get install -y \
        build-essential \
        libsqlite3-dev \
        libxxhash-dev \
        pkg-config \
        zlib1g-dev \
 && rm -rf /var/lib/apt/lists/*
# build.sh bind-mounts the project's source directory here.
WORKDIR "/source"
# Default command; arguments given to build.sh replace it.
CMD ["make"]

7
README.md Normal file
View file

@ -0,0 +1,7 @@
# account-files - static binary that combines `find`,`xxh128sum` and `sqlite`
## build instructions
```
./build.sh
```

16
build.sh Executable file
View file

@ -0,0 +1,16 @@
#!/bin/sh
# Prints a tag of at most 16 characters derived from ./Dockerfile, so that a
# changed Dockerfile leads to a differently tagged (and hence rebuilt) image.
# The first tool in the list that succeeds provides the value: a checksum if
# one of the checksum tools is installed, the modification time otherwise.
# Prints an empty line when none of the tools succeeds.
TAG(){
    RESULT=""
    for CMD in xxh128sum sha1sum md5sum 'stat -c %Y'
    do
        # $CMD is deliberately unquoted so that 'stat -c %Y' splits into words.
        RESULT="$($CMD Dockerfile 2>/dev/null)" && break
    done
    # POSIX replacement for the bash-only ${RESULT:0:16}; this script runs
    # under /bin/sh, which is not bash on e.g. Debian.
    printf '%.16s\n' "$RESULT"
}
# Everything below works relative to the directory containing this script
# (Dockerfile, source/, result/), so go there first; this makes the script
# independent of the caller's working directory.
SCRIPTDIR="$(dirname "$(realpath "$0")")"
cd "$SCRIPTDIR" || exit 1
# Image name: "build-sh--<project directory name>:<tag derived from Dockerfile>"
IMAGENAME="build-sh--$(basename "$SCRIPTDIR"):$(TAG)"
# Build the image only when no image with this name and tag exists yet.
# The command substitution is quoted so the test stays well-formed whatever
# `docker images -q` prints.
if [ -z "$(docker images -q "$IMAGENAME")" ]; then
    docker build --tag "$IMAGENAME" . || exit 1
fi
# Absolute host paths for the bind mounts: not every docker version accepts
# relative ones. Arguments to this script replace the image's default command.
docker run --rm -it -v "$SCRIPTDIR/source:/source" -v "$SCRIPTDIR/result:/result" "$IMAGENAME" "$@"

33
source/Makefile Normal file
View file

@ -0,0 +1,33 @@
# Link the static binary. "$<" is account-files.c; the generated header is listed
# as a prerequisite so that a change of the SQL triggers a rebuild.
# ../result is an order-only prerequisite (after the '|'): the directory has to
# exist, but its modification time - which changes whenever a file is created
# inside it - must not force a rebuild.
../result/account-files: account-files.c main.setup.db.sqlite.sql.h | ../result
	gcc -static "$<" $$(pkg-config --static --cflags --libs libxxhash sqlite3) -o "$@"
../result:
	mkdir -p ../result
# ALEXTODO: could be improved by keeping bytes like 'a-zA-Z0-9' etc. however would generate
# more complex code here. The total size increase is about 1k and acceptable
# the header produced here stores the string escaped via "\xXX\xXX...." for each input byte
# it strips however SQL comments (lines starting with '-- ') to save some space
# notes:
#  - od needs -v, otherwise it replaces repeated identical output lines by '*'
#  - printf is used instead of echo because echo's treatment of backslashes differs
#    between shells
main.setup.db.sqlite.sql.h: main.setup.db.sqlite.sql
	printf '%s\n' '#define MAIN_SETUP_DB_SQLITE_SQL "'"$$(grep -ve '^-- ' "$<" | od -v -A n -t x1 | tr -d '\n' | sed 's/ /\\x/g')"'"' > "$@"
## this was the previous attempt, yet `printf '%q'` does not produce likely working output....
## frankly when it comes to escaping there is no "standard" and all bash, c, js, python all
## have their own rules...
## printf '#define MAIN_SETUP_DB_SQLITE_SQL "%q"\n' "$$(grep -ve '^-- ' "$<" )" >"$@"
#
#convert.hierarchical.files.table.render.absolute.pathnames.by.hash: convert.hierarchical.files.table.render.absolute.pathnames.by.hash.sql
# time cat "$<" | sqlite3 sources.sqlite3.db | pv -l > "$@"
#
#convert.hierarchical.files.table.render.absolute.pathnames: convert.hierarchical.files.table.render.absolute.pathnames.sql
# time cat "$<" | sqlite3 sources.sqlite3.db | pv -l > "$@"
#
#convert.hierarchical.sources.table.render.absolute.pathnames:
# time cat convert.hierarchical.sources.table.render.absolute.pathnames.sql | sqlite3 sources.sqlite3.db | pv -l > convert.hierarchical.sources.table.render.absolute.pathnames

376
source/account-files.c Normal file
View file

@ -0,0 +1,376 @@
#include <errno.h>
#include <sys/types.h>
#include <dirent.h>
#include <stdio.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
// strlen
#include <string.h>
// malloc
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include "sqlite3.h"
#include "xxhash.h"
#include "main.setup.db.sqlite.sql.h"
// needed improvements
// 1. add more metadata file infos (i.e. file ownership and permissions to the database)
// (including dev, and inodes) in a sensible way
// 2. (nothing listed yet)
//
// Process-wide state shared by sqlite_my_start(), sqlite_my_end(), list() and main().
// time() value at which list() last committed and reopened the transaction
time_t flush_last=0;
// database connection, opened by sqlite_my_start()
sqlite3 * db;
// NOTE(review): not referenced by any function visible in this file
sqlite3_stmt * stmt;
// prepared by sqlite_my_start(): INSERT OR IGNORE of a directory into "sources"
sqlite3_stmt * stmt_insert_source;
// prepared by sqlite_my_start(): INSERT OR IGNORE of a regular file into "files"
sqlite3_stmt * stmt_insert_file;
// prepared by sqlite_my_start(): looks up sources.id by (parent_id, name)
sqlite3_stmt * stmt_select_source_id;
// prepared by sqlite_my_start(): looks up files.id by (source_id, name, timestamp)
sqlite3_stmt * stmt_select_file_id;
// prepared by sqlite_my_start(): stores the hash of a file row selected by id
sqlite3_stmt * stmt_update_file_hash;
// set once in main() from time(NULL); written to the timestamp column of every inserted row
sqlite_int64 timestamp;
// only needed if we have more than one SQL command (i.e. what is separated by ";")
// in an input to a sqlite3_prepare... function, in which case the remaining stuff
// is stored in the "tail"
//const char * tail = 0;
/*
 * Computes the XXH3 128-bit hash of all bytes that can be read from `fd`,
 * starting at the descriptor's current position. The descriptor is left open.
 *
 * fd: descriptor open for reading.
 *
 * Returns the digest of the bytes read. The return type has no room for an
 * error indication, therefore:
 *  - if the hash state cannot be allocated, a message is printed and an
 *    all-zero value is returned;
 *  - if read() fails, the error is printed and the digest of the bytes read
 *    up to that point is returned.
 */
XXH128_hash_t hashFile(int fd)
{
    XXH128_hash_t result = {0};
    // Allocate a state struct. Do not just use malloc() or new.
    XXH3_state_t* state = XXH3_createState();
    if (state == NULL) {
        fprintf(stderr, "hashFile: XXH3_createState failed\n");
        return result;
    }
    // Reset the state to start a new hashing session.
    XXH3_128bits_reset(state);
    char buffer[4096];
    // Read the file in chunks
    for (;;) {
        // read() returns ssize_t: keeping it signed is what allows the -1 error
        // value to be told apart from a (huge) byte count.
        ssize_t count = read(fd, buffer, sizeof(buffer));
        if (count == 0) {
            break; // end of file
        }
        if (count < 0) {
            if (errno == EINTR) {
                continue; // interrupted before any data was read: retry
            }
            perror("hashFile read");
            break;
        }
        // Run update() as many times as necessary to process the data
        XXH3_128bits_update(state, buffer, (size_t) count);
    }
    // Retrieve the finalized hash. This will not change the state.
    result = XXH3_128bits_digest(state);
    // Free the state. Do not use free().
    XXH3_freeState(state);
    return result;
}
/*
 * Prepares the database for a scan. At present this function encompasses
 *  1. the opening of the sqlite database "sources.sqlite3.db"
 *  2. the execution of the setup SQL (creation of tables and indexes in case
 *     they do not exist): taken from the file "main.setup.db.sqlite.sql" in the
 *     current directory if it can be opened, otherwise from the compiled-in
 *     MAIN_SETUP_DB_SQLITE_SQL
 *  3. the starting of a _transaction_ (for speed, and I reckon -to be tested-
 *     also race condition (i.e. file locking))
 *  4. the preparation of the statements used by list()
 *
 * Returns 0 on success, otherwise a negative value naming the failed step:
 *  -1 open database, -2 fstat of the SQL file, -3 malloc, -4 read of the SQL
 *  file, -5 execution of the setup SQL, -6 BEGIN TRANSACTION,
 *  -7 .. -11 preparation of stmt_insert_source, stmt_insert_file,
 *  stmt_select_source_id, stmt_select_file_id, stmt_update_file_hash.
 */
int sqlite_my_start(){
    if(SQLITE_OK != sqlite3_open("sources.sqlite3.db", &db)) {
        // sqlite3_open hands out a handle even on failure; release it
        sqlite3_close(db);
        db = NULL;
        return -1;
    }

    char* buffer_sqlite = NULL;
    int fd_initsql = open("main.setup.db.sqlite.sql",O_RDONLY);
    if(fd_initsql!=-1){
        struct stat statbuf;
        if( -1 == fstat(fd_initsql,&statbuf)){
            close(fd_initsql);
            return -2;
        }
        size_t total = (size_t) statbuf.st_size;
        // +1: sqlite3_exec expects a NUL terminated string
        buffer_sqlite = malloc(total + 1);
        if( buffer_sqlite == NULL)
        {
            close(fd_initsql);
            return -3;
        }
        size_t done = 0;
        while(done < total){
            ssize_t readresult = read(fd_initsql,buffer_sqlite+done,total-done);
            if(readresult == -1){
                if(errno == EINTR){
                    continue;
                }
                free(buffer_sqlite);
                close(fd_initsql);
                return -4;
            }
            if(readresult == 0){
                // file became shorter after fstat: use what is there instead
                // of looping forever
                break;
            }
            done += (size_t) readresult;
        }
        buffer_sqlite[done] = '\0';
        close(fd_initsql);
    }

    char* errmsg = NULL;
    int sqlresult = sqlite3_exec(db,buffer_sqlite == NULL ? MAIN_SETUP_DB_SQLITE_SQL : buffer_sqlite,
                                 NULL,NULL,&errmsg);
    // the SQL text is not needed any more, whatever the outcome was
    free(buffer_sqlite);
    if(SQLITE_OK != sqlresult){
        if(errmsg != NULL){
            fprintf(stderr,"setup SQL failed: %s\n",errmsg);
            sqlite3_free(errmsg);
        }
        return -5;
    }

    if(SQLITE_OK != sqlite3_exec(db, "BEGIN TRANSACTION", NULL, NULL, NULL)){
        return -6;
    }
    if(SQLITE_OK != sqlite3_prepare_v2(db, "INSERT OR IGNORE INTO sources (parent_id, "
        "name, timestamp) VALUES (?,?,?)", -1, &stmt_insert_source, NULL)) {
        return -7;
    }
    if(SQLITE_OK != sqlite3_prepare_v2(db, "INSERT OR IGNORE INTO files (source_id, "
        "name, size, mtime, timestamp) VALUES (?,?,?,?,?)", -1, &stmt_insert_file, NULL)) {
        return -8;
    }
    if(SQLITE_OK != sqlite3_prepare_v2(db, "SELECT id FROM sources WHERE parent_id = ? AND name = ?",
        -1, &stmt_select_source_id, NULL)) {
        return -9;
    }
    if(SQLITE_OK != sqlite3_prepare_v2(db, "SELECT id FROM files WHERE source_id = ? AND name = ? AND timestamp = ? ",
        -1, &stmt_select_file_id, NULL)) {
        return -10;
    }
    if(SQLITE_OK != sqlite3_prepare_v2(db, "UPDATE files SET hash = ? WHERE id = ?",
        -1, &stmt_update_file_hash, NULL)) {
        return -11;
    }
    return 0;
}
int sqlite_my_end(){
sqlite3_exec(db, "END TRANSACTION", NULL, NULL, NULL);
sqlite3_close(db);
return 0;
}
int list(int dirfd, char* curdir, int length, sqlite3_int64 source_id);

/*
 * Handles one sub-directory `name` of the directory open on `dirfd`: records it
 * in "sources" below `source_id`, then descends into it via list().
 * `curdir`/`length` are the parent's path and its buffer size (see list()).
 * Returns 0 to continue with the next entry (also after a failed open or
 * malloc, which are only reported), -8 if the row id cannot be looked up.
 */
static int list_enter_directory(int dirfd, const char* curdir, int length,
                                sqlite3_int64 source_id, const char* name){
    sqlite3_bind_int64(stmt_insert_source, 1, source_id);
    sqlite3_bind_text(stmt_insert_source, 2, name, -1, SQLITE_TRANSIENT);
    sqlite3_bind_int64(stmt_insert_source, 3, (sqlite3_int64) timestamp);
    sqlite3_step(stmt_insert_source);
    sqlite3_clear_bindings(stmt_insert_source);
    sqlite3_reset(stmt_insert_source);

    int subdirfd = openat(dirfd, name, O_RDONLY);
    if(-1 == subdirfd){
        perror("subdirfd");
        return 0;
    }

    // the row may have existed before (INSERT OR IGNORE), so look the id up
    sqlite3_int64 next_source_id = -1;
    sqlite3_bind_int64(stmt_select_source_id, 1, source_id);
    sqlite3_bind_text(stmt_select_source_id, 2, name, -1, SQLITE_TRANSIENT);
    int sqlite_result = sqlite3_step(stmt_select_source_id);
    if(SQLITE_ROW == sqlite_result){
        next_source_id = sqlite3_column_int64(stmt_select_source_id, 0);
    }
    sqlite3_clear_bindings(stmt_select_source_id);
    sqlite3_reset(stmt_select_source_id);
    if(SQLITE_ROW != sqlite_result){
        fprintf(stderr,"error getting source_id %d %s\n",sqlite_result,name);
        close(subdirfd);
        return -8;
    }

    // `length` already counts the terminating NUL of curdir; +1 for the '/'
    int next_length = length + 1 + (int) strlen(name);
    char* next_curdir = malloc((size_t) next_length);
    if(next_curdir == NULL){
        perror("malloc");
        close(subdirfd);
        return 0;
    }
    snprintf(next_curdir, (size_t) next_length, "%s/%s", curdir, name);
    // list() closes subdirfd itself. Its result is deliberately not
    // propagated: a problem inside one sub-directory does not stop the scan
    // of its siblings.
    list(subdirfd, next_curdir, next_length, next_source_id);
    free(next_curdir);
    return 0;
}

/*
 * Handles one regular file `name` of the directory open on `dirfd`: records it
 * in "files" below `source_id` with size and mtime taken from `st`; if a row
 * with the timestamp of this run is found afterwards the file content is
 * hashed and the hash stored in that row.
 * Returns 0 to continue with the next entry (also after a failed open, which
 * is only reported), -9 if the lookup of the row fails.
 */
static int list_record_file(int dirfd, sqlite3_int64 source_id,
                            const char* name, const struct stat* st){
    sqlite3_bind_int64(stmt_insert_file, 1, source_id);
    sqlite3_bind_text(stmt_insert_file, 2, name, -1, SQLITE_TRANSIENT);
    sqlite3_bind_int64(stmt_insert_file, 3, (sqlite3_int64) st->st_size);
    sqlite3_bind_int64(stmt_insert_file, 4, (sqlite3_int64) st->st_mtim.tv_sec);
    sqlite3_bind_int64(stmt_insert_file, 5, (sqlite3_int64) timestamp);
    sqlite3_step(stmt_insert_file);
    sqlite3_clear_bindings(stmt_insert_file);
    sqlite3_reset(stmt_insert_file);

    sqlite3_int64 file_id = -1;
    sqlite3_bind_int64(stmt_select_file_id, 1, source_id);
    sqlite3_bind_text(stmt_select_file_id, 2, name, -1, SQLITE_TRANSIENT);
    sqlite3_bind_int64(stmt_select_file_id, 3, (sqlite3_int64) timestamp);
    int sqlite_result = sqlite3_step(stmt_select_file_id);
    if(SQLITE_ROW == sqlite_result){
        file_id = sqlite3_column_int64(stmt_select_file_id, 0);
    }
    // Reset on EVERY outcome: binding to a statement that was stepped but not
    // reset fails with SQLITE_MISUSE, so the next file would be looked up with
    // the parameters of this one.
    sqlite3_clear_bindings(stmt_select_file_id);
    sqlite3_reset(stmt_select_file_id);

    if(SQLITE_DONE == sqlite_result){
        // no row with the timestamp of this run -> assumption is no change -> no hash
        return 0;
    }
    if(SQLITE_ROW != sqlite_result){
        // general error
        return -9;
    }

    // insertion merits hashing
    int filefd = openat(dirfd, name, O_RDONLY);
    if(filefd < 0){
        perror("open");
        //ALEXTODO: handle more gracefully
        return 0;
    }
    XXH128_hash_t hash = hashFile(filefd);
    close(filefd);
    // the hash is stored as the raw bytes of the XXH128_hash_t struct
    sqlite3_bind_blob(stmt_update_file_hash, 1, (const void*) &hash, sizeof(XXH128_hash_t), SQLITE_TRANSIENT);
    sqlite3_bind_int64(stmt_update_file_hash, 2, file_id);
    sqlite3_step(stmt_update_file_hash);
    sqlite3_clear_bindings(stmt_update_file_hash);
    sqlite3_reset(stmt_update_file_hash);
    return 0;
}

/*
 * Records the content of one directory in the database and descends into its
 * sub-directories. Every entry found is printed to stdout as "curdir/name".
 * Entries that are neither directories nor regular files (symbolic links are
 * not followed) are printed but not recorded.
 *
 * dirfd:     descriptor of the directory to scan. This function takes
 *            ownership of it: it is closed before the function returns.
 * curdir:    path of that directory, used for output and to build the paths
 *            of sub-directories.
 * length:    size in bytes of curdir including the terminating NUL.
 * source_id: id of the row in "sources" that represents this directory.
 *
 * About every 10 seconds the running transaction is committed and a new one
 * is started.
 *
 * Returns 0 on success, 1 if the directory stream cannot be opened or read,
 * 2 if an entry cannot be stat'ed, -8 / -9 if a database lookup fails. On a
 * non-zero result the remaining entries of this directory are not processed.
 */
int list(int dirfd, char* curdir, int length, sqlite3_int64 source_id){
    if(time(NULL)-flush_last > 10){
        flush_last=time(NULL);
        sqlite3_exec(db, "END TRANSACTION", NULL, NULL, NULL);
        fprintf(stderr, " NEW TA\n");
        sqlite3_exec(db, "BEGIN TRANSACTION", NULL, NULL, NULL);
    }

    DIR* pdir = fdopendir(dirfd);
    if(pdir==NULL){
        perror("fehler fopendir");
        close(dirfd);
        return 1;
    }

    int result = 0;
    while(result == 0){
        // readdir returns NULL both at the end and on error; errno tells which
        errno = 0;
        struct dirent* pdirent = readdir(pdir);
        if(pdirent==NULL){
            if(errno!=0){
                fprintf(stderr," readdir errno = %d\n",errno);
                result = 1;
            }
            break;
        }
        const char* name = pdirent->d_name;
        // skip "." (self) and ".." (parent)
        if(name[0]=='.' && (name[1]==0 || (name[1]=='.' && name[2]==0))){
            continue;
        }
        // ALEXTODO: determine if openat + fstat is the better option than fstatat here
        // reason: the fstat-at already works (within a folder - using fd file descriptor)
        // reason: not all opened files need to be opened maybe...
        struct stat statbuf;
        if(0 != fstatat(dirfd,name,&statbuf,AT_SYMLINK_NOFOLLOW)){
            fprintf(stderr,"fstatat errno = %d\n",errno);
            result = 2;
            break;
        }
        printf("%s/%s\n",curdir,name);
        if(S_ISDIR(statbuf.st_mode)){
            result = list_enter_directory(dirfd, curdir, length, source_id, name);
        }
        else if(S_ISREG(statbuf.st_mode)){
            result = list_record_file(dirfd, source_id, name, &statbuf);
        }
    }
    // single exit: the stream (and with it dirfd) is closed on every path
    closedir(pdir);
    return result;
}
/*
 * Scans the current working directory recursively and records what is found
 * in the database. Command line arguments are not evaluated.
 *
 * Returns 0 on success, -1 if the database could not be set up, -2 if it
 * could not be finished properly, -3 if the scan of "." failed (the database
 * is still finished in that case so that what was recorded gets committed).
 */
int main(int argc, char* argv[]){
    (void) argc;
    (void) argv;
    char* curdir= ".";
    int exit_code = 0;
    // one timestamp for the whole run: it is written to every inserted row
    timestamp = (sqlite_uint64) time(NULL);
    int result = sqlite_my_start();
    if(0!=result)
    {
        fprintf(stderr,"error sqlite_my_start() %d\n",result);
        return -1;
    }
    puts(".");
    int dirfd = openat(AT_FDCWD,".",O_RDONLY);
    if (dirfd > -1){
        // 0 is the id of the root row in "sources"
        result = list(dirfd,curdir,1+strlen(curdir),0);
        if(0!=result)
        {
            fprintf(stderr,"error list() %d\n",result);
            exit_code = -3;
        }
    } else {
        perror("openat .");
        exit_code = -3;
    }
    result = sqlite_my_end();
    if(0!=result)
    {
        fprintf(stderr,"error sqlite_my_end() %d\n",result);
        return -2;
    }
    return exit_code;
}

View file

@ -0,0 +1,49 @@
-- main.setup.db.sqlite.sql:
-- file that contains the sqlite SQL used to setup the database.
-- idea is to have the sqlite SQL in a separate ideally small file to
-- a) be able to tweak/see SQL database setup without necessarily force recompilation
-- b) change for syntax highlighting...
-- tradeoff challenges assumed are issues with:
-- a) split of individual SQL commands and
-- b) handling of comments
-- sources: the directory tree. One row per directory; parent_id refers to the
-- row of the containing directory. (parent_id,name) is unique and a second
-- insert of the same pair is silently ignored.
CREATE TABLE IF NOT EXISTS sources (
id INTEGER PRIMARY KEY,
parent_id INTEGER REFERENCES sources(id),
name TEXT,
timestamp INTEGER,
UNIQUE (parent_id,name) ON CONFLICT IGNORE
);
CREATE INDEX IF NOT EXISTS sources_index__timestamp ON sources(timestamp);
CREATE INDEX IF NOT EXISTS sources_index__parent_id ON sources(parent_id);
CREATE INDEX IF NOT EXISTS sources_index__name ON sources(name);
-- for stmt_select_source_id (this statement was present twice; once is enough)
CREATE INDEX IF NOT EXISTS sources_index__parent_id__name ON sources(parent_id,name);
--CREATE INDEX IF NOT EXISTS sources_index__parent_id__name__timestamp ON sources(parent_id,name,timestamp);
-- root row: id 0 is the source_id the program starts its scan with; it is its own parent
INSERT OR IGNORE INTO sources (id,parent_id,name,timestamp) VALUES (0, 0, '/', 0);
-- files: one row per recorded state of a regular file.
-- source_id refers to the row in sources of the directory containing the file,
-- size and mtime are the values stat reported, timestamp is the start time of
-- the run that inserted the row, hash is filled in by the program after the
-- insert (stmt_update_file_hash).
-- (source_id,name,size,mtime) is unique and a second insert of the same
-- combination is silently ignored, i.e. a file that is unchanged in size and
-- mtime keeps its existing row (and timestamp).
CREATE TABLE IF NOT EXISTS files(
id INTEGER PRIMARY KEY,
source_id INTEGER REFERENCES sources(id),
name TEXT,
size INTEGER,
mtime INTEGER,
timestamp INTEGER,
hash BLOB,
UNIQUE (source_id,name,size,mtime) ON CONFLICT IGNORE
);
CREATE INDEX IF NOT EXISTS files_index__timestamp ON files(timestamp);
CREATE INDEX IF NOT EXISTS files_index__hash ON files(hash);
-- for stmt_select_file_id
CREATE INDEX IF NOT EXISTS files_index__source_id__name__timestamp ON files(source_id,name,timestamp);
-- NOTE(review): covers the same columns as the UNIQUE constraint above - confirm whether it is needed
CREATE INDEX IF NOT EXISTS files_index__source_id__name__size__mtime ON files(source_id,name,size,mtime);
-- hashes
-- CREATE TABLE IF NOT EXISTS hashes(
-- id INTEGER PRIMARY KEY,
-- hash BLOB,
-- timestamp INTEGER,
-- UNIQUE (hash) ON CONFLICT IGNORE
-- );
-- CREATE INDEX IF NOT EXISTS hashes_index__timestamp ON hashes(timestamp);
-- CREATE INDEX IF NOT EXISTS hashes_index__hash ON hashes(hash);