Linus Torvalds's dcache patch
[Posted August 8, 2012 by jake]
               
               
 
commit 8c8d4bb18dbaacfe88a4ce758610b386028499ba
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Thu May 31 10:26:52 2012 -0700
    vfs: add simple direct-mapped dcache lookup front-end
    
    I've pushed __d_lookup_rcu() just about as far as I could, and it still
    had some problems.
    
    The problems were mainly due to:
    
     - the complexity of the slow-case handling causes register spills
    
     - the hash chain lookup loop causes not only register pressure, but
       also the extra magic "mask off lock bit from the hash chain head
       pointer" etc logic
    
     - the hash list need to be dynamically sized (we want *big* caches, but
       you can't use the same size for big and small machines), which causes
       the initial hash lookup itself to be more complex.
    
    This looks like a viable solution to all three problems, and it is
    actually surprisingly simple: make a trivial fixed-size direct-mapped L1
    dentry cache.  No chains, no locking, no nothing.
    
    This gives measurable improvement on my microbenchmark, and gets good
    hit-rates on both kernel compiles and even on something like "updatedb",
    which I'd have expected to be one of the worst possible cases.
    Apparently updatedb still ends up looking up the same files (/etc/fstab
    etc) a lot.  So those good hit-rates seem to often be due to really
    stupid programming, but hey, I think we all agree that "stupid
    programming" is likely the common case that we generally do need to also
    optimize for ;)
    
    For my kernel compile benchmark ("make -j" on a fully built tree), the
    profile shows (this is kernel-only profile, so user space overhead
    removed):
    
      8.19%  [k] link_path_walk
      7.74%  [k] __d_lookup_rcu
      5.66%  [k] selinux_inode_permission
      3.73%  [k] do_lookup
      2.86%  [k] path_lookupat
      2.72%  [k] avc_has_perm_noaudit
      2.71%  [k] inode_has_perm.isra.49.constprop.71
      2.68%  [k] avc_lookup
      2.51%  [k] generic_permission
      ...
      0.78%  [k] __d_lookup_rcu_slow
      ...
    
    where "__d_lookup_rcu_slow()" is the exact same old __d_lookup_rcu(), so
    it's not really "slow", but it's quite noticeably slower than the new
    streamlined __d_lookup_rcu().  And as you can tell, that means that we
    must have a 90%+ hitrate in the new L1 dcache lookup, since we only see
    10% as much time in the slow routine as in the L1 front-end.
    
    HOWEVER. The fast L1 lookup right now is subtly buggy: not the lookup
    itself, but the code that adds entries to the L1 is racy with those
    entries being removed. I added a comment on it, and I think it's quite
    solvable (and it's all in the slow-path), but I'll need to think it
    through.
    
    Cc: Al Viro <viro@zeniv.linux.org.uk>
    Cc: Nick Piggin <npiggin@gmail.com>
    Cc: Miklos Szeredi <mszeredi@suse.cz>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 79 insertions(+), 2 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 4435d8b32904..f549f6505e53 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -402,6 +402,45 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 }
 
 /*
+ * This has a NULL parent and zero length, and will thus
+ * never match anything. But it means that the dcache_l1
+ * array never contains NULL, so you don't need to check.
+ */
+static struct dentry invalid_dentry;
+
+#define L1_DCACHE_BITS (13)
+#define L1_DCACHE_SIZE (1u << L1_DCACHE_BITS)
+static struct dentry *dcache_l1[L1_DCACHE_SIZE] = {
+	[0 ... L1_DCACHE_SIZE-1] = &invalid_dentry
+};
+
+static unsigned int dentry_l1_index(unsigned int hash, const struct dentry *parent)
+{
+	hash += (unsigned long) parent / L1_CACHE_BYTES;
+	hash = hash + (hash >> L1_DCACHE_BITS);
+	return hash & (L1_DCACHE_SIZE-1);
+}
+
+/* Must be called with d_lock held */
+static void d_remove_from_l1(const struct dentry *dentry)
+{
+	unsigned int n = dentry_l1_index(dentry->d_name.hash, dentry->d_parent);
+	dcache_l1[n] = &invalid_dentry;
+}
+
+static void d_add_to_l1(struct dentry *dentry, unsigned int hash, const struct dentry *parent)
+{
+	unsigned int n = dentry_l1_index(hash, parent);
+	dcache_l1[n] = dentry;
+}
+
+static struct dentry *d_lookup_l1(unsigned int hash, const struct dentry *parent)
+{
+	unsigned int n = dentry_l1_index(hash, parent);
+	return ACCESS_ONCE(dcache_l1[n]);
+}
+
+/*
  * Unhash a dentry without inserting an RCU walk barrier or checking that
  * dentry->d_lock is locked.  The caller must take care of that, if
  * appropriate.
@@ -416,6 +455,7 @@ static void __d_shrink(struct dentry *dentry)
 			b = d_hash(dentry->d_parent, dentry->d_name.hash);
 
 		hlist_bl_lock(b);
+		d_remove_from_l1(dentry);
 		__hlist_bl_del(&dentry->d_hash);
 		dentry->d_hash.pprev = NULL;
 		hlist_bl_unlock(b);
@@ -1825,7 +1865,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
  * NOTE! The caller *has* to check the resulting dentry against the sequence
  * number we've returned before using any of the resulting dentry state!
  */
-struct dentry *__d_lookup_rcu(const struct dentry *parent,
+static noinline struct dentry *__d_lookup_rcu_slow(const struct dentry *parent,
 				const struct qstr *name,
 				unsigned *seqp, struct inode *inode)
 {
@@ -1896,12 +1936,49 @@ seqretry:
 
 		if (dentry->d_name.hash_len != hashlen)
 			continue;
-		if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
+		if (!dentry_cmp(dentry, str, hashlen_len(hashlen))) {
+			/* FIXME! RACY! What if it just got unhashed? */
+			d_add_to_l1(dentry, hashlen_hash(hashlen), parent);
 			return dentry;
+		}
 	}
 	return NULL;
 }
 
+/*
+ * Fast non-chained L1 hash lookup.
+ *
+ * NOTE! We don't need to worry about DCACHE_OP_COMPARE, because
+ * dentries with complex parents never get added to the L1 cache.
+ *
+ * We also don't need to worry about d_lookup_l1() returning NULL,
+ * because we fill the cache with otherwise valid dentries that do
+ * not match anything.
+ */
+struct dentry *__d_lookup_rcu(const struct dentry *parent,
+				const struct qstr *name,
+				unsigned *seqp, struct inode *inode)
+{
+	u64 hashlen = name->hash_len;
+	struct dentry *dentry = d_lookup_l1(hashlen_hash(hashlen), parent);
+	unsigned int seq;
+
+	do {
+		seq = raw_seqcount_begin(&dentry->d_seq);
+		if (unlikely(dentry->d_parent != parent))
+			break;
+		*seqp = seq;
+		if (unlikely(dentry->d_name.hash_len != hashlen))
+			break;
+		if (unlikely(dentry_cmp(dentry, name->name, hashlen_len(hashlen))))
+			break;
+		if (unlikely(d_unhashed(dentry)))
+			break;
+		return dentry;
+	} while (0);
+	return __d_lookup_rcu_slow(parent, name, seqp, inode);
+}
+
 /**
  * d_lookup - search for a dentry
  * @parent: parent dentry