fusetoys

various hacky fuse filesystem utilities
git clone https://a3nm.net/git/fusetoys/

commit f9e03620ab77a3690599fc0ca1460d22194388f2
parent 64fc5d4825616061467df4d4ccc0dba64338f62b
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Fri, 28 Dec 2012 23:56:03 +0100

add all

Diffstat:
README | 23+++++++++++++++++++++++
TODO | 3+++
cachefs.py | 79+++++++++++++++++++++++++++++++++++++++++--------------------------------------
loopfs.py | 120+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
metacachefs.py | 199+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 386 insertions(+), 38 deletions(-)

diff --git a/README b/README
@@ -0,0 +1,23 @@
+1. loopfs.py SOURCE TARGET
+Mount SOURCE on TARGET, forward everything from TARGET to SOURCE.
+
+2. metacachefs.py source target
+Mount SOURCE on TARGET, keep an unbounded in-memory cache of all the metadata
+and use it whenever possible. Write operations invalidate relevant parts of the
+cache.
+
+3. cachefs.py --cache=CACHE --size=SIZE SOURCE TARGET
+Mount SOURCE on TARGET and, whenever a file is read from TARGET, replicate it in
+CACHE and use the version from CACHE. Ensure that the disk usage of CACHE stays
+below SIZE by removing the files of CACHE with a minimal last access date.
+Stores the last access dates in CACHE/.cache.db. The files are replicated in
+CACHE following the hierarchy of SOURCE, so that CACHE can be used (read-only)
+as an incomplete copy of SOURCE whenever SOURCE is unavailable. Write operations
+are performed on SOURCE and on CACHE for cached files.
+
+cachefs.py and metacachefs.py are meant to be used on high-latency, low
+throughput filesystems such as sshfs or curlftpfs.
+
+This code is just a toy and has only been tested in trivial settings. It will
+most probably not work, and could break things.
+
diff --git a/TODO b/TODO
@@ -0,0 +1,3 @@
+- fail more gracefully for missing endpoints
+- partial reads, or bufferize as much as needed
+- don't cache for a small read
diff --git a/cachefs.py b/cachefs.py
@@ -11,6 +11,8 @@ import time
 import heapq
 from threading import Lock

+DB = ".cache.db"
+
 class __Struct:
     def __init__(self, **kw):
         for k, v in kw.iteritems():
@@ -26,7 +28,7 @@ fuse.fuse_python_api = (0, 2)
 class File:
     @property
     def size(self):
-        return os.stat(os.path.join(self.cache.path, self.path)).st_size
+        return os.lstat(os.path.join(self.cache.path, self.path)).st_size

     def __init__(self, cache, path):
         self.cache = cache
@@ -55,6 +57,8 @@ class Cache:
             dirpath, dirnames, filenames = x
             for f in filenames:
                 full = os.path.join(dirpath, f)[len(self.path):]
+                if full == DB:
+                    continue
                 if full not in self.files.keys():
                     self.addFile(full)

@@ -103,35 +107,34 @@ class CacheFS(fuse.Fuse):
         self.logger = logging.getLogger('cachefs')
         self.logger.setLevel(logging.INFO)
         self.logger.addHandler(logging.StreamHandler(sys.stderr))
-        self.parser.add_option('--source', dest='source', metavar='SOURCE',
-                               help="source")
         self.parser.add_option('--cache', dest='cache', metavar='CACHE',
-                               help="cache")
-        self.parser.add_option('--db', dest='db', metavar='DB',
-                               help="db")
+                               help="cache (mandatory)")
+        self.parser.add_option('--db', dest='db', metavar='DB', default=DB,
+                               help="db location relative to the cache")
         self.parser.add_option('--size', dest='size', metavar='SIZE',
-                               type='int', help="size")
+                               type='int', help="maximal size of cache")
         print self

     def fsinit(self):
         options = self.cmdline[0]
-        self.sourceRoot = options.source
+        args = self.cmdline[1]
+        print args
+        self.sourceRoot = args[0]
         self.cacheRoot = options.cache
         self.db = options.db
         self.size = options.size
-        print "will load"
-        print self
+        if not self.size:
+            vfs = os.statvfs(self.cacheRoot)
+            # half of available space on the cache fs
+            self.size = (vfs.f_bavail * vfs.f_bsize) / 2
+        print "size is %d" % self.size
         try:
             with open(self.db, 'rb') as f:
                 self.cache = pickle.load(f)
             assert(self.cache != None)
-            print "loaded"
         except Exception as e:
-            print "a problem occurred, have a fresh cache"
             self.cache = Cache(self.size, self.cacheRoot)
-        print self.cache
-        print "AHA"

         if (self.cache.maxSize > self.size):
             self.makeRoom(self.cache.maxSize - self.size)
             self.cache.maxSize = self.size
@@ -150,6 +153,7 @@ class CacheFS(fuse.Fuse):

     def makeRoom(self, bytes):
         # TODO maybe don't flush all the cache for a big file even if it fits...
+        # TODO adjust for the available size of the underlying FS
         if bytes > self.cache.maxSize:
             raise FileTooBigException()
         print("now current free size is %d and must fit %d" %
@@ -167,9 +171,11 @@

     def isCached(self, path):
         """is a path cached?"""
+        if path == "/" + DB:
+            return False
         if os.path.exists(self.cachePath(path)):
-            statOriginal = os.stat(self.cachePath(path))
-            statCache = os.stat(self.cachePath(path))
+            statOriginal = os.lstat(self.cachePath(path))
+            statCache = os.lstat(self.cachePath(path))
             if statOriginal.st_size == statCache.st_size:
                 # the cache file is good
                 # TODO better checks
@@ -177,29 +183,30 @@
         return False

     def prepare(self, path):
-        #if not flags & os.O_RDONLY:
-        #    return self.sourcePath(path)
-        print "PREPARATION"
+        if path == "/" + DB:
+            return self.sourcePath(path)
         if not os.path.exists(self.sourcePath(path)):
             # no such original file, let the source handle it
-            print "*** original missing"
             return self.sourcePath(path)
         if self.isCached(path):
-            print "*** already cached"
             self.registerHit(path)
             return self.cachePath(path)
-        statOriginal = os.stat(self.sourcePath(path))
+        statOriginal = os.lstat(self.sourcePath(path))
         # cache the file and then open it
-        print "PREPARATIONca"
         with self.rwlock:
             try:
-                print "*** make room"
                 self.makeRoom(statOriginal.st_size)
             except FileTooBigException:
                 # no room to cache, open the original file
                 return self.sourcePath(path)
-            print ("*** docopy from %s to %s" % (self.sourcePath(path),
-                self.cachePath(path)))
+            # create folder hierarchy
+            head, tail = os.path.split(self.cachePath(path))
+            try:
+                os.makedirs(head)
+            except OSError as exc: # Python >2.5
+                if exc.errno == errno.EEXIST and os.path.isdir(path):
+                    pass
+                else: raise
             shutil.copy2(self.sourcePath(path), self.cachePath(path))
             self.cache.addFile(path[1:])
             return self.cachePath(path)
@@ -225,9 +232,8 @@ class CacheFS(fuse.Fuse):
     def create(self, path, flags, mode):
         return os.open(self.sourcePath(path), os.O_WRONLY | os.O_CREAT, mode)

-    def getattr(self, path, fh=None):
-        st = os.lstat(self.sourcePath(path))
-        return st
+    def getattr(self, path):
+        return os.lstat(self.sourcePath(path))

     def link(self, target, source):
         os.link(self.sourcePath(source), self.sourcePath(target))
@@ -235,8 +241,8 @@ class CacheFS(fuse.Fuse):
     def mkdir(self, path, mode):
         return os.mkdir(self.sourcePath(path), mode)

-    def mknod(self, filename):
-        return os.mknod(self.sourcePath(filename))
+    def mknod(self, path, mode, rdev):
+        return os.mknod(self.sourcePath(path), mode, rdev)

     def open(self, path, flags):
         #print("will call open %s %s" % (self.prepare(path), flags))
@@ -252,22 +258,19 @@ class CacheFS(fuse.Fuse):
             os.close(fh)
             return x

-    def readdir(self, path, fh):
+    def readdir(self, path, offset):
         path = self.sourcePath(path)
-        myIno = os.stat(path).st_ino
+        myIno = os.lstat(path).st_ino
         yield fuse.Direntry('.', ino=myIno)
-        parentIno = os.stat(os.path.join(path, "..")).st_ino
+        parentIno = os.lstat(os.path.join(path, "..")).st_ino
         yield fuse.Direntry('..', ino=parentIno)
         for name in os.listdir(path):
-            ino = os.stat(os.path.join(path, name)).st_ino
+            ino = os.lstat(os.path.join(path, name)).st_ino
             yield fuse.Direntry(name, ino=ino)

     def readlink(self, path):
         return os.readlink(self.sourcePath(path))

-    def release(self, path, fh):
-        return os.close(fh)
-
     def rename(self, old, new):
         wasCached = self.isCached(old)
         retval = os.rename(self.sourcePath(old), self.sourcePath(new))
diff --git a/loopfs.py b/loopfs.py
@@ -0,0 +1,120 @@
+#!/usr/bin/python
+
+"""LoopFS -- a loopback file system with FUSE"""
+
+import fuse
+import errno
+import os
+import sys
+import threading
+
+fuse.fuse_python_api = (0, 2)
+
+class LoopFS(fuse.Fuse):
+    def __init__(self, *args, **kw):
+        fuse.Fuse.__init__(self, *args, **kw)
+
+        self.rwlock = threading.Lock()
+
+    def fsinit(self):
+        self.source = self.cmdline[1][0]
+
+    def sourcePath(self, path):
+        return os.path.join(self.source, path[1:])
+
+    def access(self, path, mode):
+        if not os.access(self.sourcePath(path), mode):
+            return -errno.EACCES
+
+    def chmod(self, path, mode):
+        return os.chmod(self.sourcePath(path), mode)
+
+    def chown(self, path, mode):
+        return os.chown(self.sourcePath(path), mode)
+
+    def create(self, path, flags, mode):
+        return os.open(self.sourcePath(path), os.O_WRONLY | os.O_CREAT, mode)
+
+    def getattr(self, path):
+        return os.lstat(self.sourcePath(path))
+
+    def link(self, target, source):
+        os.link(self.sourcePath(source), self.sourcePath(target))
+
+    def mkdir(self, path, mode):
+        return os.mkdir(self.sourcePath(path), mode)
+
+    def mknod(self, path, mode, rdev):
+        return os.mknod(self.sourcePath(path), mode, rdev)
+
+    def open(self, path, flags):
+        return 0
+
+    def read(self, path, size, offset):
+        with self.rwlock:
+            fh = os.open(self.sourcePath(path), os.O_RDONLY)
+            os.lseek(fh, offset, 0)
+            x = os.read(fh, size)
+            os.close(fh)
+            return x
+
+    def readdir(self, path, offset):
+        path = self.sourcePath(path)
+        myIno = os.lstat(path).st_ino
+        print "will yield dot"
+        yield fuse.Direntry('.', ino=myIno)
+        print "will yield parent"
+        try:
+            parentIno = os.lstat(os.path.join(path, "..")).st_ino
+            print "yielded parent"
+        except OSError as e:
+            parentIno = myIno # root
+            print "faked parent"
+        yield fuse.Direntry('..', ino=parentIno)
+        print "will yield children"
+        for name in os.listdir(path):
+            print "yield %s" % os.path.join(path, name)
+            ino = os.lstat(os.path.join(path, name)).st_ino
+            yield fuse.Direntry(name, ino=ino)
+        print "alldone"
+
+    def readlink(self, path):
+        return os.readlink(self.sourcePath(path))
+
+    def rename(self, old, new):
+        return os.rename(self.sourcePath(old), self.sourcePath(new))
+
+    def rmdir(self, path):
+        return os.rmdir(self.sourcePath(path))
+
+    def statfs(self):
+        return os.statvfs(self.sourceRoot)
+
+    def symlink(self, target, source):
+        return os.symlink(self.sourcePath(source), self.sourcePath(target))
+
+    def truncate(self, path, length, fh=None):
+        with open(self.sourcePath(path), 'r+') as f:
+            return f.truncate(length)
+
+    def unlink(self, path):
+        return os.unlink(self.sourcePath(path))
+
+    def utimens(self, path, ts_acc, ts_mod):
+        times = (ts_acc.tv_sec, ts_mod.tv_sec)
+        return os.utime(self.sourcePath(path), times)
+
+    def write(self, path, data, offset):
+        with self.rwlock:
+            fh = os.open(self.sourcePath(path), os.O_WRONLY)
+            os.lseek(fh, offset, 0)
+            x = os.write(fh, data)
+            os.close(fh)
+            return x
+
+
+if __name__ == "__main__":
+    loopfs = LoopFS()
+    fuse_opts = loopfs.parse(['-o', 'fsname=loopfs'] + sys.argv[1:])
+    loopfs.main()
+
diff --git a/metacachefs.py b/metacachefs.py
@@ -0,0 +1,199 @@
+#!/usr/bin/python
+
+"""MetaCacheFS -- a file system to cache metadata"""
+
+import fuse
+import errno
+import os
+import sys
+from loopfs import LoopFS
+
+fuse.fuse_python_api = (0, 2)
+
+class MissingNode:
+    def __init__(self):
+        self.access = -errno.ENOENT
+        self.getattr = -errno.ENOENT
+        self.readlink = -errno.ENOENT
+        self.readdir = -errno.ENOENT
+
+class Node:
+    def __init__(self):
+        self.flush()
+
+    def flush(self):
+        self.access = None
+        self.getattr = None
+        self.readlink = None
+        self.flushFolder()
+
+    def flushFolder(self):
+        self.readdir = None
+
+
+class MetaCacheFS(LoopFS):
+    def __init__(self, *args, **kw):
+        LoopFS.__init__(self, *args, **kw)
+        self.files = {}
+
+    def flushIfPresent(self, path):
+        if path in self.files.keys():
+            self.files[path].flush()
+    def deleteIfPresent(self, path):
+        if path in self.files.keys():
+            del self.files[path]
+    def getFolder(self, path):
+        head, tail = os.path.split(path)
+        return head
+
+    def flushFolder(self, path):
+        if self.getFolder(path) in self.files.keys():
+            self.files[self.getFolder(path)].flushFolder()
+
+    def exceptionToStatus(self, e):
+        if isinstance(e, OSError):
+            return -e.errno
+        else:
+            return -errno.ENOENT
+
+    def getOrCreate(self, path):
+        if path not in self.files.keys():
+            head, tail = os.path.split(path)
+            if head in self.files.keys():
+                d = self.files[head]
+                if d.readdir != None and tail not in [x.name for x in
+                        d.readdir]:
+                    return MissingNode() # we know it can't exist
+            self.files[path] = Node()
+        return self.files[path]
+
+    def access(self, path, mode):
+        pathh = os.path.normpath(path)
+        f = self.getOrCreate(pathh)
+        if f.access == None:
+            f.access = super(MetaCacheFS, self).access(path, mode)
+        return f.access
+
+    def chmod(self, path, mode):
+        pathh = os.path.normpath(path)
+        self.flushIfPresent(pathh)
+        return super(MetaCacheFS, self).chmod(path, mode)
+
+    def chown(self, path, mode):
+        pathh = os.path.normpath(path)
+        self.flushIfPresent(pathh)
+        return super(MetaCacheFS, self).chown(path, mode)
+
+    def create(self, path, flags, mode):
+        pathh = os.path.normpath(path)
+        self.flushFolder(pathh)
+        return super(MetaCacheFS, self).create(path, flags, mode)
+
+    def getattr(self, path):
+        pathh = os.path.normpath(path)
+        f = self.getOrCreate(pathh)
+        if f.getattr == None:
+            try:
+                x = super(MetaCacheFS, self).getattr(path)
+            except Exception as e:
+                x = self.exceptionToStatus(e)
+            f.getattr = x
+        return f.getattr
+
+    def link(self, target, source):
+        target2 = os.path.normpath(target)
+        self.flushFolder(target2)
+
+        return super(MetaCacheFS, self).link(target, source)
+
+    def mkdir(self, path, mode):
+        pathh = os.path.normpath(path)
+        self.flushFolder(pathh)
+        return super(MetaCacheFS, self).mkdir(path, mode)
+
+    def mknod(self, path, mode, rdev):
+        pathh = os.path.normpath(path)
+        self.flushFolder(pathh)
+        return super(MetaCacheFS, self).mknod(path, mode, rdev)
+
+    def open(self, path, flags):
+        return super(MetaCacheFS, self).open(path, flags)
+
+    def read(self, path, size, offset):
+        return super(MetaCacheFS, self).read(path, size, offset)
+
+    def readdir(self, path, offset):
+        pathh = os.path.normpath(path)
+        f = self.getOrCreate(pathh)
+        if f.readdir == None:
+            f.readdir = list(super(MetaCacheFS, self).readdir(path, offset))
+        return f.readdir
+
+    def readlink(self, path):
+        pathh = os.path.normpath(path)
+        f = self.getOrCreate(pathh)
+        if f.readlink == None:
+            try:
+                x = super(MetaCacheFS, self).readlink(path)
+            except Exception as e:
+                x = self.except_to_status(e)
+            f.readlink = x
+        return f.readlink
+
+    def rename(self, old, new):
+        old2 = os.path.normpath(old)
+        new2 = os.path.normpath(new)
+        self.flushIfPresent(old2)
+        self.flushIfPresent(new2)
+        self.flushFolder(new2)
+        self.flushFolder(old2)
+        return super(MetaCacheFS, self).rename(old, new)
+
+    def rmdir(self, path):
+        pathh = os.path.normpath(path)
+        self.deleteIfPresent(pathh)
+        self.flushFolder(pathh)
+        return super(MetaCacheFS, self).rmdir(path)
+
+    def statfs(self):
+        return super(MetaCacheFS, self).statfs()
+
+    def symlink(self, target, source):
+        target2 = os.path.normpath(target)
+        self.flushFolder(target2)
+
+        return super(MetaCacheFS, self).symlink(target, source)
+
+    def truncate(self, path, length):
+        pathh = os.path.normpath(path)
+        self.flushIfPresent(pathh)
+        return super(MetaCacheFS, self).truncate(path, length)
+
+    def unlink(self, path):
+        pathh = os.path.normpath(path)
+        self.deleteIfPresent(pathh)
+        self.flushFolder(pathh)
+        return super(MetaCacheFS, self).unlink(path)
+
+    def utimens(self, path, ts_acc, ts_mod):
+        pathh = os.path.normpath(path)
+        self.flushIfPresent(pathh)
+        return super(MetaCacheFS, self).utimens(path, ts_acc, ts_mod)
+
+    def write(self, path, data, offset):
+        pathh = os.path.normpath(path)
+        self.flushIfPresent(pathh)
+        return super(MetaCacheFS, self).write(path, data, offset)
+
+
+if __name__ == "__main__":
+    #if len(sys.argv) != 6:
+    #    print("Usage: %s SOURCE CACHE SIZE DB MOUNTPOINT" % sys.argv[0])
+    #    sys.exit(1)
+    #logging.getLogger().setLevel(logging.DEBUG)
+    #cachefs = CacheFS(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
+
+    metacachefs = MetaCacheFS()
+    fuse_opts = metacachefs.parse(['-o', 'fsname=metacachefs'] + sys.argv[1:])
+    metacachefs.main()
+
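The README hunk above describes cachefs.py's eviction policy: when the cache would exceed SIZE, the files with the oldest last-access dates are dropped first. A minimal sketch of that policy is shown below, assuming a dict of last-access timestamps like the one persisted in CACHE/.cache.db; the function and parameter names are illustrative, not code from this commit.

    # Illustrative sketch only: LRU-style eviction as described in the README,
    # not the implementation from this commit.
    import heapq
    import os

    def make_room(cache_root, last_access, used, max_size, needed):
        """Drop least-recently-accessed cached files until `needed` bytes fit.

        last_access: dict of relative path -> last access timestamp
        used: current disk usage of the cache, in bytes
        Returns the new disk usage.
        """
        heap = [(ts, path) for path, ts in last_access.items()]
        heapq.heapify(heap)  # smallest timestamp (oldest access) pops first
        while heap and used + needed > max_size:
            ts, path = heapq.heappop(heap)
            full = os.path.join(cache_root, path)
            used -= os.lstat(full).st_size  # reclaim that file's size
            os.unlink(full)
            del last_access[path]
        return used

The actual makeRoom in cachefs.py additionally raises FileTooBigException when a single file cannot fit at all, and this commit adds a TODO to also account for the real free space on the underlying filesystem.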