commit f9e03620ab77a3690599fc0ca1460d22194388f2
parent 64fc5d4825616061467df4d4ccc0dba64338f62b
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Fri, 28 Dec 2012 23:56:03 +0100
add all
Diffstat:
 README         |  23 +++++++++++++++++++++++
 TODO           |   3 +++
 cachefs.py     |  79 +++++++++++++++++++++++++++++++++++++++++--------------------------------------
 loopfs.py      | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 metacachefs.py | 199 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 386 insertions(+), 38 deletions(-)
diff --git a/README b/README
@@ -0,0 +1,23 @@
+1. loopfs.py SOURCE TARGET
+Mount SOURCE on TARGET and forward every operation on TARGET to SOURCE.
+
+2. metacachefs.py SOURCE TARGET
+Mount SOURCE on TARGET and keep an unbounded in-memory cache of all the
+metadata, used whenever possible. Write operations invalidate the relevant
+parts of the cache.
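+
+Roughly, the caching idea is the following (an illustrative sketch with made-up
+helper names, not the actual metacachefs.py code):
+
+    import os
+
+    cache = {}                    # path -> cached metadata results
+
+    def cached_lstat(path):
+        node = cache.setdefault(path, {})
+        if "lstat" not in node:
+            node["lstat"] = os.lstat(path)   # remote call, done once
+        return node["lstat"]
+
+    def invalidating_unlink(path):
+        os.unlink(path)
+        cache.pop(path, None)     # a write invalidates the cached metadata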
+
+3. cachefs.py --cache=CACHE --size=SIZE SOURCE TARGET
+Mount SOURCE on TARGET and, whenever a file is read through TARGET, replicate
+it in CACHE and serve the copy from CACHE. The disk usage of CACHE is kept
+below SIZE by evicting the cached files with the oldest last access dates; the
+last access dates are stored in CACHE/.cache.db. Files are replicated in CACHE
+following the hierarchy of SOURCE, so that CACHE can be used (read-only) as an
+incomplete copy of SOURCE whenever SOURCE is unavailable. Write operations are
+performed on SOURCE, and also on CACHE for files that are already cached.
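+
+For example, the eviction step could look as follows (an illustrative sketch
+with made-up helper names, not the actual cachefs.py code):
+
+    import heapq, os
+
+    def make_room(entries, max_size, used, needed):
+        # entries: list of (last_access_time, path, size) for cached files
+        heapq.heapify(entries)    # oldest access time first
+        while entries and used + needed > max_size:
+            _, path, size = heapq.heappop(entries)
+            os.unlink(path)       # drop the least recently used cached copy
+            used -= size
+        return used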
+
+cachefs.py and metacachefs.py are meant to be used on high-latency,
+low-throughput filesystems such as sshfs or curlftpfs.
+
+This code is just a toy and has only been tested in trivial settings. It will
+most probably not work, and could break things.
+
diff --git a/TODO b/TODO
@@ -0,0 +1,3 @@
+- fail more gracefully when an endpoint is missing
+- support partial reads, or buffer only as much as is needed
+- don't cache a whole file for a small read
diff --git a/cachefs.py b/cachefs.py
@@ -11,6 +11,8 @@ import time
import heapq
from threading import Lock
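+# pickled cache state (last access dates), stored relative to the cache root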
+DB = ".cache.db"
+
class __Struct:
def __init__(self, **kw):
for k, v in kw.iteritems():
@@ -26,7 +28,7 @@ fuse.fuse_python_api = (0, 2)
class File:
@property
def size(self):
- return os.stat(os.path.join(self.cache.path, self.path)).st_size
+ return os.lstat(os.path.join(self.cache.path, self.path)).st_size
def __init__(self, cache, path):
self.cache = cache
@@ -55,6 +57,8 @@ class Cache:
dirpath, dirnames, filenames = x
for f in filenames:
full = os.path.join(dirpath, f)[len(self.path):]
+ if full == DB:
+ continue
if full not in self.files.keys():
self.addFile(full)
@@ -103,35 +107,34 @@ class CacheFS(fuse.Fuse):
self.logger = logging.getLogger('cachefs')
self.logger.setLevel(logging.INFO)
self.logger.addHandler(logging.StreamHandler(sys.stderr))
- self.parser.add_option('--source', dest='source', metavar='SOURCE',
- help="source")
self.parser.add_option('--cache', dest='cache', metavar='CACHE',
- help="cache")
- self.parser.add_option('--db', dest='db', metavar='DB',
- help="db")
+ help="cache (mandatory)")
+ self.parser.add_option('--db', dest='db', metavar='DB', default=DB,
+ help="db location relative to the cache")
self.parser.add_option('--size', dest='size', metavar='SIZE',
- type='int', help="size")
+ type='int', help="maximal size of cache")
print self
def fsinit(self):
options = self.cmdline[0]
- self.sourceRoot = options.source
+ args = self.cmdline[1]
+ print args
+ self.sourceRoot = args[0]
self.cacheRoot = options.cache
self.db = os.path.join(self.cacheRoot, options.db)
self.size = options.size
- print "will load"
- print self
+ if not self.size:
+ vfs = os.statvfs(self.cacheRoot)
+ # half of available space on the cache fs
+ self.size = (vfs.f_bavail * vfs.f_bsize) / 2
+ print "size is %d" % self.size
try:
with open(self.db, 'rb') as f:
self.cache = pickle.load(f)
assert(self.cache != None)
- print "loaded"
except Exception as e:
- print "a problem occurred, have a fresh cache"
self.cache = Cache(self.size, self.cacheRoot)
- print self.cache
- print "AHA"
if (self.cache.maxSize > self.size):
self.makeRoom(self.cache.maxSize - self.size)
self.cache.maxSize = self.size
@@ -150,6 +153,7 @@ class CacheFS(fuse.Fuse):
def makeRoom(self, bytes):
# TODO maybe don't flush all the cache for a big file even if it fits...
+ # TODO adjust for the available size of the underlying FS
if bytes > self.cache.maxSize:
raise FileTooBigException()
print("now current free size is %d and must fit %d" %
@@ -167,9 +171,11 @@ class CacheFS(fuse.Fuse):
def isCached(self, path):
"""is a path cached?"""
+ if path == "/" + DB:
+ return False
if os.path.exists(self.cachePath(path)):
- statOriginal = os.stat(self.cachePath(path))
- statCache = os.stat(self.cachePath(path))
+ statOriginal = os.lstat(self.sourcePath(path))
+ statCache = os.lstat(self.cachePath(path))
if statOriginal.st_size == statCache.st_size:
# the cache file is good
# TODO better checks
@@ -177,29 +183,30 @@ class CacheFS(fuse.Fuse):
return False
def prepare(self, path):
- #if not flags & os.O_RDONLY:
- # return self.sourcePath(path)
- print "PREPARATION"
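+ """Return the path to actually open: the cached copy if available (caching
+ the file first when there is room), otherwise the path on the source."""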
+ if path == "/" + DB:
+ return self.sourcePath(path)
if not os.path.exists(self.sourcePath(path)):
# no such original file, let the source handle it
- print "*** original missing"
return self.sourcePath(path)
if self.isCached(path):
- print "*** already cached"
self.registerHit(path)
return self.cachePath(path)
- statOriginal = os.stat(self.sourcePath(path))
+ statOriginal = os.lstat(self.sourcePath(path))
# cache the file and then open it
- print "PREPARATIONca"
with self.rwlock:
try:
- print "*** make room"
self.makeRoom(statOriginal.st_size)
except FileTooBigException:
# no room to cache, open the original file
return self.sourcePath(path)
- print ("*** docopy from %s to %s" % (self.sourcePath(path),
- self.cachePath(path)))
+ # create folder hierarchy
+ head, tail = os.path.split(self.cachePath(path))
+ try:
+ os.makedirs(head)
+ except OSError as exc: # Python >2.5
+ if exc.errno == errno.EEXIST and os.path.isdir(head):
+ pass
+ else: raise
shutil.copy2(self.sourcePath(path), self.cachePath(path))
self.cache.addFile(path[1:])
return self.cachePath(path)
@@ -225,9 +232,8 @@ class CacheFS(fuse.Fuse):
def create(self, path, flags, mode):
return os.open(self.sourcePath(path), os.O_WRONLY | os.O_CREAT, mode)
- def getattr(self, path, fh=None):
- st = os.lstat(self.sourcePath(path))
- return st
+ def getattr(self, path):
+ return os.lstat(self.sourcePath(path))
def link(self, target, source):
os.link(self.sourcePath(source), self.sourcePath(target))
@@ -235,8 +241,8 @@ class CacheFS(fuse.Fuse):
def mkdir(self, path, mode):
return os.mkdir(self.sourcePath(path), mode)
- def mknod(self, filename):
- return os.mknod(self.sourcePath(filename))
+ def mknod(self, path, mode, rdev):
+ return os.mknod(self.sourcePath(path), mode, rdev)
def open(self, path, flags):
#print("will call open %s %s" % (self.prepare(path), flags))
@@ -252,22 +258,19 @@ class CacheFS(fuse.Fuse):
os.close(fh)
return x
- def readdir(self, path, fh):
+ def readdir(self, path, offset):
path = self.sourcePath(path)
- myIno = os.stat(path).st_ino
+ myIno = os.lstat(path).st_ino
yield fuse.Direntry('.', ino=myIno)
- parentIno = os.stat(os.path.join(path, "..")).st_ino
+ parentIno = os.lstat(os.path.join(path, "..")).st_ino
yield fuse.Direntry('..', ino=parentIno)
for name in os.listdir(path):
- ino = os.stat(os.path.join(path, name)).st_ino
+ ino = os.lstat(os.path.join(path, name)).st_ino
yield fuse.Direntry(name, ino=ino)
def readlink(self, path):
return os.readlink(self.sourcePath(path))
- def release(self, path, fh):
- return os.close(fh)
-
def rename(self, old, new):
wasCached = self.isCached(old)
retval = os.rename(self.sourcePath(old), self.sourcePath(new))
diff --git a/loopfs.py b/loopfs.py
@@ -0,0 +1,120 @@
+#!/usr/bin/python
+
+"""LoopFS -- a loopback file system with FUSE"""
+
+import fuse
+import errno
+import os
+import sys
+import threading
+
+fuse.fuse_python_api = (0, 2)
+
+class LoopFS(fuse.Fuse):
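+ """Loopback file system: forward every operation to the source directory."""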
+ def __init__(self, *args, **kw):
+ fuse.Fuse.__init__(self, *args, **kw)
+
+ self.rwlock = threading.Lock()
+
+ def fsinit(self):
+ self.source = self.cmdline[1][0]
+
+ def sourcePath(self, path):
+ return os.path.join(self.source, path[1:])
+
+ def access(self, path, mode):
+ if not os.access(self.sourcePath(path), mode):
+ return -errno.EACCES
+
+ def chmod(self, path, mode):
+ return os.chmod(self.sourcePath(path), mode)
+
+ def chown(self, path, uid, gid):
+ return os.chown(self.sourcePath(path), uid, gid)
+
+ def create(self, path, flags, mode):
+ return os.open(self.sourcePath(path), os.O_WRONLY | os.O_CREAT, mode)
+
+ def getattr(self, path):
+ return os.lstat(self.sourcePath(path))
+
+ def link(self, target, source):
+ os.link(self.sourcePath(source), self.sourcePath(target))
+
+ def mkdir(self, path, mode):
+ return os.mkdir(self.sourcePath(path), mode)
+
+ def mknod(self, path, mode, rdev):
+ return os.mknod(self.sourcePath(path), mode, rdev)
+
+ def open(self, path, flags):
+ return 0
+
+ def read(self, path, size, offset):
+ with self.rwlock:
+ fh = os.open(self.sourcePath(path), os.O_RDONLY)
+ os.lseek(fh, offset, 0)
+ x = os.read(fh, size)
+ os.close(fh)
+ return x
+
+ def readdir(self, path, offset):
+ path = self.sourcePath(path)
+ myIno = os.lstat(path).st_ino
+ print "will yield dot"
+ yield fuse.Direntry('.', ino=myIno)
+ print "will yield parent"
+ try:
+ parentIno = os.lstat(os.path.join(path, "..")).st_ino
+ print "yielded parent"
+ except OSError as e:
+ parentIno = myIno # root
+ print "faked parent"
+ yield fuse.Direntry('..', ino=parentIno)
+ print "will yield children"
+ for name in os.listdir(path):
+ print "yield %s" % os.path.join(path, name)
+ ino = os.lstat(os.path.join(path, name)).st_ino
+ yield fuse.Direntry(name, ino=ino)
+ print "alldone"
+
+ def readlink(self, path):
+ return os.readlink(self.sourcePath(path))
+
+ def rename(self, old, new):
+ return os.rename(self.sourcePath(old), self.sourcePath(new))
+
+ def rmdir(self, path):
+ return os.rmdir(self.sourcePath(path))
+
+ def statfs(self):
+ return os.statvfs(self.source)
+
+ def symlink(self, target, source):
+ return os.symlink(self.sourcePath(source), self.sourcePath(target))
+
+ def truncate(self, path, length, fh=None):
+ with open(self.sourcePath(path), 'r+') as f:
+ return f.truncate(length)
+
+ def unlink(self, path):
+ return os.unlink(self.sourcePath(path))
+
+ def utimens(self, path, ts_acc, ts_mod):
+ times = (ts_acc.tv_sec, ts_mod.tv_sec)
+ return os.utime(self.sourcePath(path), times)
+
+ def write(self, path, data, offset):
+ with self.rwlock:
+ fh = os.open(self.sourcePath(path), os.O_WRONLY)
+ os.lseek(fh, offset, 0)
+ x = os.write(fh, data)
+ os.close(fh)
+ return x
+
+
+if __name__ == "__main__":
+ loopfs = LoopFS()
+ fuse_opts = loopfs.parse(['-o', 'fsname=loopfs'] + sys.argv[1:])
+ loopfs.main()
+
diff --git a/metacachefs.py b/metacachefs.py
@@ -0,0 +1,199 @@
+#!/usr/bin/python
+
+"""MetaCacheFS -- a file system to cache metadata"""
+
+import fuse
+import errno
+import os
+import sys
+from loopfs import LoopFS
+
+fuse.fuse_python_api = (0, 2)
+
+class MissingNode:
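+ """Negative lookup result: every metadata operation returns ENOENT."""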
+ def __init__(self):
+ self.access = -errno.ENOENT
+ self.getattr = -errno.ENOENT
+ self.readlink = -errno.ENOENT
+ self.readdir = -errno.ENOENT
+
+class Node:
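+ """Per-path cache of metadata call results; None means not cached yet."""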
+ def __init__(self):
+ self.flush()
+
+ def flush(self):
+ self.access = None
+ self.getattr = None
+ self.readlink = None
+ self.flushFolder()
+
+ def flushFolder(self):
+ self.readdir = None
+
+
+class MetaCacheFS(LoopFS):
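+ """LoopFS with an unbounded in-memory metadata cache, invalidated on writes."""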
+ def __init__(self, *args, **kw):
+ LoopFS.__init__(self, *args, **kw)
+ self.files = {}
+
+ def flushIfPresent(self, path):
+ if path in self.files.keys():
+ self.files[path].flush()
+
+ def deleteIfPresent(self, path):
+ if path in self.files.keys():
+ del self.files[path]
+
+ def getFolder(self, path):
+ head, tail = os.path.split(path)
+ return head
+
+ def flushFolder(self, path):
+ if self.getFolder(path) in self.files.keys():
+ self.files[self.getFolder(path)].flushFolder()
+
+ def exceptionToStatus(self, e):
+ if isinstance(e, OSError):
+ return -e.errno
+ else:
+ return -errno.ENOENT
+
+ def getOrCreate(self, path):
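+ """Return the cached node for path, creating it if needed, or a MissingNode
+ when the cached parent listing shows that the path cannot exist."""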
+ if path not in self.files.keys():
+ head, tail = os.path.split(path)
+ if head in self.files.keys():
+ d = self.files[head]
+ if d.readdir != None and tail not in [x.name for x in d.readdir]:
+ return MissingNode() # we know it can't exist
+ self.files[path] = Node()
+ return self.files[path]
+
+ def access(self, path, mode):
+ pathh = os.path.normpath(path)
+ f = self.getOrCreate(pathh)
+ if f.access == None:
+ f.access = super(MetaCacheFS, self).access(path, mode)
+ return f.access
+
+ def chmod(self, path, mode):
+ pathh = os.path.normpath(path)
+ self.flushIfPresent(pathh)
+ return super(MetaCacheFS, self).chmod(path, mode)
+
+ def chown(self, path, uid, gid):
+ pathh = os.path.normpath(path)
+ self.flushIfPresent(pathh)
+ return super(MetaCacheFS, self).chown(path, uid, gid)
+
+ def create(self, path, flags, mode):
+ pathh = os.path.normpath(path)
+ self.flushFolder(pathh)
+ return super(MetaCacheFS, self).create(path, flags, mode)
+
+ def getattr(self, path):
+ pathh = os.path.normpath(path)
+ f = self.getOrCreate(pathh)
+ if f.getattr == None:
+ try:
+ x = super(MetaCacheFS, self).getattr(path)
+ except Exception as e:
+ x = self.exceptionToStatus(e)
+ f.getattr = x
+ return f.getattr
+
+ def link(self, target, source):
+ target2 = os.path.normpath(target)
+ self.flushFolder(target2)
+
+ return super(MetaCacheFS, self).link(target, source)
+
+ def mkdir(self, path, mode):
+ pathh = os.path.normpath(path)
+ self.flushFolder(pathh)
+ return super(MetaCacheFS, self).mkdir(path, mode)
+
+ def mknod(self, path, mode, rdev):
+ pathh = os.path.normpath(path)
+ self.flushFolder(pathh)
+ return super(MetaCacheFS, self).mknod(path, mode, rdev)
+
+ def open(self, path, flags):
+ return super(MetaCacheFS, self).open(path, flags)
+
+ def read(self, path, size, offset):
+ return super(MetaCacheFS, self).read(path, size, offset)
+
+ def readdir(self, path, offset):
+ pathh = os.path.normpath(path)
+ f = self.getOrCreate(pathh)
+ if f.readdir == None:
+ f.readdir = list(super(MetaCacheFS, self).readdir(path, offset))
+ return f.readdir
+
+ def readlink(self, path):
+ pathh = os.path.normpath(path)
+ f = self.getOrCreate(pathh)
+ if f.readlink == None:
+ try:
+ x = super(MetaCacheFS, self).readlink(path)
+ except Exception as e:
+ x = self.exceptionToStatus(e)
+ f.readlink = x
+ return f.readlink
+
+ def rename(self, old, new):
+ old2 = os.path.normpath(old)
+ new2 = os.path.normpath(new)
+ self.flushIfPresent(old2)
+ self.flushIfPresent(new2)
+ self.flushFolder(new2)
+ self.flushFolder(old2)
+ return super(MetaCacheFS, self).rename(old, new)
+
+ def rmdir(self, path):
+ pathh = os.path.normpath(path)
+ self.deleteIfPresent(pathh)
+ self.flushFolder(pathh)
+ return super(MetaCacheFS, self).rmdir(path)
+
+ def statfs(self):
+ return super(MetaCacheFS, self).statfs()
+
+ def symlink(self, target, source):
+ target2 = os.path.normpath(target)
+ self.flushFolder(target2)
+
+ return super(MetaCacheFS, self).symlink(target, source)
+
+ def truncate(self, path, length):
+ pathh = os.path.normpath(path)
+ self.flushIfPresent(pathh)
+ return super(MetaCacheFS, self).truncate(path, length)
+
+ def unlink(self, path):
+ pathh = os.path.normpath(path)
+ self.deleteIfPresent(pathh)
+ self.flushFolder(pathh)
+ return super(MetaCacheFS, self).unlink(path)
+
+ def utimens(self, path, ts_acc, ts_mod):
+ pathh = os.path.normpath(path)
+ self.flushIfPresent(pathh)
+ return super(MetaCacheFS, self).utimens(path, ts_acc, ts_mod)
+
+ def write(self, path, data, offset):
+ pathh = os.path.normpath(path)
+ self.flushIfPresent(pathh)
+ return super(MetaCacheFS, self).write(path, data, offset)
+
+
+if __name__ == "__main__":
+ metacachefs = MetaCacheFS()
+ fuse_opts = metacachefs.parse(['-o', 'fsname=metacachefs'] + sys.argv[1:])
+ metacachefs.main()
+