[OpenAFS-devel] speed up namei volume operations by grouping fsyncs

Rainer Toebbicke rtb@pclella.cern.ch
Fri, 21 Apr 2006 09:06:38 +0200


This is a multi-part message in MIME format.
--------------000402090402030105010209
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit

The attached patch does not break with the namei tradition to fsync 
the link count file at every change, although the need for this is 
still debatable.

Rather, it groups fsyncs to the link count file in batches, with a 
final fsync prior to releasing the volume back to the file server. It 
hence speeds up clone and related operations (vos backup, vos move, 
salvage, etc) tremendously, most noticeable for volumes with a large 
number of files (hundreds of thousands), where we observed speedups in 
excess of 200...


(Bcc-ed to openafs-bugs)



-- 
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Rainer Toebbicke
European Laboratory for Particle Physics(CERN) - Geneva, Switzerland
Phone: +41 22 767 8985       Fax: +41 22 767 7155

--------------000402090402030105010209
Content-Type: text/plain;
 name="patch_namei_speedup"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="patch_namei_speedup"

unchanged:
--- openafs/src/vol/ihandle.h.o1381	2004-08-25 09:14:19.000000000 +0200
+++ openafs/src/vol/ihandle.h	2005-05-24 15:22:17.000000000 +0200
@@ -221,6 +221,7 @@
 
 /* Flags for the Inode handle */
 #define IH_REALLY_CLOSED		1
+#define IH_DELAY_SYNC			16
 
 /* Hash function for inode handles */
 #define I_HANDLE_HASH_SIZE	1024	/* power of 2 */
@@ -466,7 +467,7 @@
 #define FDH_WRITE(H, B, S) OS_WRITE((H)->fd_fd, B, S)
 #define FDH_SEEK(H, O, F) OS_SEEK((H)->fd_fd, O, F)
 
-#define FDH_SYNC(H) OS_SYNC((H)->fd_fd)
+#define FDH_SYNC(H) ((H->fd_ih->ih_flags&IH_DELAY_SYNC) ? 0 : OS_SYNC((H)->fd_fd))
 #define FDH_TRUNC(H, L) OS_TRUNC((H)->fd_fd, L)
 #define FDH_SIZE(H) OS_SIZE((H)->fd_fd)
 
unchanged:
--- openafs/src/vol/purge.c.o1381	2004-08-25 09:14:19.000000000 +0200
+++ openafs/src/vol/purge.c	2005-05-25 11:13:16.000000000 +0200
@@ -89,7 +89,7 @@
     VOL_UNLOCK;
 }
 
-#define MAXOBLITATONCE	200
+#define MAXOBLITATONCE	1000
 /* delete a portion of an index, adjusting offset appropriately.  Returns 0 if
    things work and we should be called again, 1 if success full and done, and -1
    if an error occurred.  It adjusts offset appropriately on 0 or 1 return codes,
@@ -148,10 +148,13 @@
     OS_SYNC(afile->str_fd);
 
     /* finally, do the idec's */
+    V_linkHandle(avp)->ih_flags|=IH_DELAY_SYNC;		/* severe performance penalty */
     for (i = 0; i < iindex; i++) {
 	IH_DEC(V_linkHandle(avp), inodes[i], V_parentId(avp));
 	DOPOLL;
     }
+    V_linkHandle(avp)->ih_flags&=~IH_DELAY_SYNC;
+    IH_CONDSYNC(V_linkHandle(avp));
 
     /* return the new offset */
     *aoffset = offset;
unchanged:
--- openafs/src/vol/clone.c.o1381	2004-08-25 09:14:19.000000000 +0200
+++ openafs/src/vol/clone.c	2005-05-27 09:39:04.000000000 +0200
@@ -227,6 +227,12 @@
     decRock.vol = V_parentId(rwvp);
 
     /* Read each vnode in the old volume's index file */
+    /* fsyncing the link count file for every inode has a severe 
+       performance penalty, therefore we turn it off temporarily.
+       This assumes we're the only one on that file/volume  -
+       in particular when we force the fsync later!
+    */
+    V_linkHandle(rwvp)->ih_flags|=IH_DELAY_SYNC;
     for (offset = vcp->diskSize;
 	 STREAM_READ(rwvnode, vcp->diskSize, 1, rwfile) == 1;
 	 offset += vcp->diskSize) {
@@ -345,6 +351,12 @@
      * and shouldn't do the idecs.
      */
   error_exit:
+    /* Now take the fsync-bypass away again and force an fsync.
+       Again: assumes we're alone on this file, otherwise we need a lock!
+    */
+    V_linkHandle(rwvp)->ih_flags&=~IH_DELAY_SYNC;
+    IH_CONDSYNC(V_linkHandle(rwvp));
+
     if (rwfile)
 	STREAM_CLOSE(rwfile);
     if (clfilein)
unchanged:
--- openafs/src/vol/namei_ops.c.o1382	2004-11-09 18:16:40.000000000 +0100
+++ openafs/src/vol/namei_ops.c	2005-06-01 16:40:28.000000000 +0200
@@ -572,6 +572,8 @@
 
     if (p2 == -1 && p3 == VI_LINKTABLE) {
 	/* hack at tmp to setup for set link count call. */
+	memset((void *)&tfd, 0, sizeof(FdHandle_t));	/* minimalistic still, but a little cleaner */
+	tfd.fd_ih = &tmp;
 	tfd.fd_fd = fd;
 	code = namei_SetLinkCount(&tfd, (Inode) 0, 1, 0);
     }
only in patch2:
unchanged:
--- openafs/src/volser/dumpstuff.c.1rig	2005-11-02 12:39:29.000000000 +0100
+++ openafs/src/volser/dumpstuff.c	2006-02-14 14:49:58.000000000 +0100
@@ -1033,7 +1033,13 @@
 
     tdelo = delo;
     while (1) {
-	if (ReadVnodes(iodp, vp, 0, b1, s1, b2, s2, tdelo)) {
+	int temprc;
+
+	V_linkHandle(avp)->ih_flags |= IH_DELAY_SYNC;	/* Avoid repetitive fdsync()s on linkfile */
+	temprc = ReadVnodes(iodp, vp, 0, b1, s1, b2, s2, tdelo);
+	V_linkHandle(avp)->ih_flags &= ~IH_DELAY_SYNC;	/* normal sync behaviour again */
+	IH_CONDSYNC(V_linkHandle(avp));			/* sync link file */
+	if (temprc) {
 	    error = VOLSERREAD_DUMPERROR;
 	    goto clean;
 	}

--------------000402090402030105010209--