LWN.net Logo

Kernel based rename undo

Kernel based rename undo

Posted Apr 10, 2009 7:35 UTC (Fri) by bojan (subscriber, #14302)
In reply to: Kernel based rename undo by butlerm
Parent article: Linux Storage and Filesystem workshop, day 1

I know. What I'm talking about is synchronisation between processes in terms of contents of data (i.e. one process may write a change, which gets lost when another process does the same - your stock race). So, you cannot just open(), write(), close(), rename() with multiple processes. You have to lock, otherwise your processes will stomp all over each other's data.

An example of doing the same with multiple processes, for the case where the kernel doesn't guarantee data before metadata on rename, is below. Bugs included, of course ;-).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <signal.h>
#include <aio.h>

/* size of the stack buffer main() uses to copy the contents of "foo" */
#define BUF_SIZE 50

/*
 * NULL until main() mmap()s the "foo" shared memory object; from then on it
 * points at the int counter that main() increments and whack() decrements.
 */
static int *count=NULL;

/* XXX this is just a demo, no error checking */

/*
 * Handler that main() installs for SIGRTMIN, the signal its AIO control block
 * asks for on completion.
 *
 * info->si_value.sival_ptr carries the address of main()'s lock descriptor.
 * Under that lock the shared counter is decremented, and when it reaches zero
 * the backup file "foo~" is removed.
 *
 * signum and context are unused.
 */
static void whack(int signum,siginfo_t *info,void *context){
  const int *fdp=info->si_value.sival_ptr;
  int lockfd=*fdp;

  /* enter critical section */
  lockf(lockfd,F_LOCK,0);

  /* one fewer sync outstanding; drop the backup once none are left */
  *count-=1;
  if(*count==0)
    unlink("foo~");

  /* leave critical section */
  lockf(lockfd,F_ULOCK,0);
}

/*
 * Demo: several processes rewrite "foo" via write-to-"foo.new" + rename(),
 * keeping a hard-linked backup "foo~" until the asynchronous sync of the new
 * data has completed.
 *
 * The "foo" shared memory object holds an int counter (mapped through the
 * global `count`) and doubles as the lockf() lock file. The first process to
 * create it restores "foo" from a leftover "foo~" and writes the initial
 * counter; later processes wait for that write to appear.
 *
 * Returns 0 on success, 1 if the counter never shows up in the shared object.
 * As in the rest of this demo, individual system calls are not checked.
 */
int main(int argc,char **argv){
  int sd,fd;
  ssize_t len;
  char buf[BUF_SIZE];
  struct aiocb cb;
  const struct aiocb *cbl[]={&cb};
  struct sigaction act;

  /* AIO control block setup: completion raises SIGRTMIN carrying &sd */
  memset(&cb,0,sizeof(cb));
  cb.aio_sigevent.sigev_notify=SIGEV_SIGNAL;
  cb.aio_sigevent.sigev_signo=SIGRTMIN;
  cb.aio_sigevent.sigev_value.sival_ptr=&sd;

  /* signal handler setup */
  memset(&act,0,sizeof(act));
  act.sa_flags=SA_SIGINFO;
  act.sa_sigaction=whack;
  sigaction(SIGRTMIN,&act,NULL);

  /* setup shared counter, restore */
  if((sd=shm_open("foo",O_RDWR|O_CREAT|O_EXCL,S_IRUSR|S_IWUSR))==-1){
    int tries=20;
    struct stat s;

    /* not the first to arrive, open and wait for counter to be written */
    sd=shm_open("foo",O_RDWR,S_IRUSR|S_IWUSR);

    /* zeroed so a failing fstat() reads as "not written yet", not as garbage */
    memset(&s,0,sizeof(s));
    fstat(sd,&s);

    /*
     * Give up only when the counter is still missing after all the retries.
     * Testing the retry count inside the loop keeps "succeeded on the last
     * attempt" and "ran out of attempts" apart.
     */
    while(s.st_size<(off_t)sizeof(*count)){
      /* something's really screwed */
      if(!tries--)
        return 1;

      sleep(1);
      fstat(sd,&s);
    }
  } else{ /* first to arrive, restore */
    int zero=0; /* initial counter value, also sizes the shared object */

    /* don't care if we fail */
    if(!rename("foo~","foo"))
      fprintf(stderr,"Restored.\n");

    write(sd,&zero,sizeof(zero));
  }

  /* shared counter */
  count=mmap(NULL,sizeof(int),PROT_READ|PROT_WRITE,MAP_SHARED,sd,0);

  /* critical section */
  lockf(sd,F_LOCK,0);

  /* don't care if it fails - already there */
  link("foo","foo~");

  /* read existing file */
  fd=open("foo",O_RDONLY);
  len=read(fd,buf,BUF_SIZE);
  close(fd);

  /* write to new file and initiate sync */
  fd=open("foo.new",O_WRONLY|O_CREAT|O_TRUNC,S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
  write(fd,buf,len);
  cb.aio_fildes=fd;
  /* counted before the request is queued so the handler never sees it missing */
  (*count)++;
  aio_fsync(O_SYNC,&cb);
  close(fd);

  /* put the new file in place */
  rename("foo.new","foo");

  /* end critical section */
  lockf(sd,F_ULOCK,0);

  /* do something really useful here */

  /*
   * wait for AIO completion
   * NOTE(review): a follow-up comment in this thread reports that
   * aio_suspend() has no effect on aio_fsync() requests - confirm.
   */
  aio_suspend(cbl,1,NULL);

  /* clean up shared memory */
  munmap(count,sizeof(int));
  close(sd);

  return 0;
}


(Log in to post comments)

Kernel based rename undo

Posted Apr 10, 2009 15:34 UTC (Fri) by butlerm (subscriber, #13312) [Link]

If you want multiple writers, you definitely need locking. I was referring
to single writer / multiple readers, which is a far more common situation.

Kernel based rename undo

Posted Apr 11, 2009 10:22 UTC (Sat) by bojan (subscriber, #14302) [Link]

BTW, aio_suspend() has no effect on aio_fsync(). That, for sure, is a bug.

Kernel based rename undo

Posted Apr 11, 2009 12:06 UTC (Sat) by bojan (subscriber, #14302) [Link]

The lockf() would also land in trouble with the mmap().

Kernel based rename undo

Posted Apr 12, 2009 4:52 UTC (Sun) by bojan (subscriber, #14302) [Link]

A more robust version below:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <signal.h>
#include <aio.h>
#include <errno.h>

/* XXX this is just a demo, no error checking */

/* descriptor of the "foo" shared memory object; all fcntl() locks are taken on it */
static int sd=-1;

/*
 * Number of aio_fsync() requests still outstanding. config() increments it,
 * the SIGRTMIN handler decrements it and main() polls it, so it must be
 * volatile sig_atomic_t: otherwise the polling loop is not guaranteed to see
 * the handler's writes.
 * NOTE(review): the increment in normal context is still a read-modify-write
 * that the handler can interrupt - confirm whether SIGRTMIN should be blocked
 * around it.
 */
static volatile sig_atomic_t count=0;

/* two bytes written into the lock file so byte 0 and byte 1 exist as lock ranges */
static char filler[2]={0,0};

/*
 * Byte-range lock descriptors for fcntl() on the lock file.
 *
 * Byte 0 ("f" locks) guards the update of "foo" in config().
 * Byte 1 ("b" locks) guards the backup "foo~": main() holds a read lock on
 * it while it runs and needs a write lock on it before unlinking the backup.
 */

/* byte 0: exclusive lock */
static struct flock fwl={
  .l_type=F_WRLCK,
  .l_whence=SEEK_SET,
  .l_start=0,
  .l_len=1
};

/* byte 0: unlock */
static struct flock ful={
  .l_type=F_UNLCK,
  .l_whence=SEEK_SET,
  .l_start=0,
  .l_len=1
};

/* byte 1: exclusive lock */
static struct flock bwl={
  .l_type=F_WRLCK,
  .l_whence=SEEK_SET,
  .l_start=1,
  .l_len=1
};

/* byte 1: shared lock */
static struct flock brl={
  .l_type=F_RDLCK,
  .l_whence=SEEK_SET,
  .l_start=1,
  .l_len=1
};

/* byte 1: unlock */
static struct flock bul={
  .l_type=F_UNLCK,
  .l_whence=SEEK_SET,
  .l_start=1,
  .l_len=1
};

/*
 * Handler that main() installs for SIGRTMIN, the signal config() requests
 * when one of its aio_fsync() calls completes: one fewer sync is outstanding.
 * None of the arguments are used.
 */
static void aiodone(int signum,siginfo_t *info,void *context){
  (void)signum;
  (void)info;
  (void)context;

  /* signal counter down */
  count-=1;
}

/* size of the buffer used to copy the contents of "foo" */
#define BUF_SIZE 50

/*
 * Rewrite "foo" once, under the byte-0 write lock on the lock file.
 *
 * Makes sure a backup hard link "foo~" exists, copies up to BUF_SIZE bytes of
 * "foo" into "foo.new", queues an asynchronous sync of the new file and then
 * renames it over "foo".
 *
 * cb: caller-owned AIO control block. It is fully reinitialised here and must
 *     stay valid until the request finishes. Completion is reported via
 *     SIGRTMIN.
 *
 * The global `count` is left incremented only if the sync was actually queued.
 */
static void config(struct aiocb *cb){
  int fd;
  ssize_t len;
  char buf[BUF_SIZE];

  /* critical section */
  while(fcntl(sd,F_SETLKW,&fwl));

  /* don't care if it fails, any version is OK */
  link("foo","foo~");

  /* read existing file */
  fd=open("foo",O_RDONLY);
  len=read(fd,buf,BUF_SIZE);
  close(fd);

  /* a failed open()/read() leaves -1 here; don't hand that to write() as a size */
  if(len<0)
    len=0;

  /* write to new file */
  fd=open("foo.new",O_WRONLY|O_CREAT|O_TRUNC,S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
  write(fd,buf,len);

  /* AIO control block setup */
  memset(cb,0,sizeof(*cb));
  cb->aio_sigevent.sigev_notify=SIGEV_SIGNAL;
  cb->aio_sigevent.sigev_signo=SIGRTMIN;
  cb->aio_fildes=fd;

  /* signal counter up, before queuing, so the handler can never run first */
  count++;

  /*
   * initiate sync and close
   * A request that could not be queued never raises SIGRTMIN, so its count
   * has to be taken back here or main() would wait for it forever.
   * NOTE(review): fd is closed while the sync may still be pending - confirm
   * the platform still completes the request.
   */
  if(aio_fsync(O_SYNC,cb))
    count--;
  close(fd);

  /* put the new file in place */
  rename("foo.new","foo");

  /* end critical section */
  while(fcntl(sd,F_SETLKW,&ful));
}

/* how many times main() runs config() */
#define LOOPS 10
/* how many one-second waits a late arrival allows for the lock file to be set up */
#define TRIES 20

/*
 * Demo driver: run config() LOOPS times while holding a read lock on the
 * "backup" byte of the lock file, wait for every queued sync to complete, then
 * remove the backup "foo~" if no other process still holds that lock.
 *
 * The first process to create the "foo" shared memory object restores "foo"
 * from a leftover "foo~" and writes the filler bytes; later processes wait
 * for those bytes to appear.
 *
 * Returns 0 on success, 1 if the lock file never gets set up.
 * As in the rest of this demo, individual system calls are not checked.
 */
int main(int argc,char **argv){
  int i;
  struct aiocb cb[LOOPS];
  struct sigaction act;

  /* setup shared file, restore */
  if((sd=shm_open("foo",O_RDWR|O_CREAT|O_EXCL,S_IRUSR|S_IWUSR))==-1){
    int tries=TRIES;
    struct stat f;

    /* not the first to arrive, open and wait for restore */
    sd=shm_open("foo",O_RDWR,S_IRUSR|S_IWUSR);

    /* zeroed so a failing fstat() reads as "not set up yet", not as garbage */
    memset(&f,0,sizeof(f));
    fstat(sd,&f);

    /*
     * Give up only when the filler is still missing after all the retries.
     * Testing the retry count inside the loop keeps "succeeded on the last
     * attempt" and "ran out of attempts" apart.
     */
    while(f.st_size<(off_t)sizeof(filler)){
      /* something's really screwed */
      if(!tries--)
        return 1;

      sleep(1);
      fstat(sd,&f);
    }
  } else{ /* first to arrive, restore */
    /* don't care if we fail */
    if(!rename("foo~","foo"))
      fprintf(stderr,"Restored.\n");

    /* setup lock file */
    write(sd,&filler,sizeof(filler));
  }

  /* signal handler setup */
  memset(&act,0,sizeof(act));
  act.sa_flags=SA_SIGINFO;
  act.sa_sigaction=aiodone;
  sigaction(SIGRTMIN,&act,NULL);

  /* we need the backup file to be there: shared lock on byte 1 */
  while(fcntl(sd,F_SETLKW,&brl));

  /* program may run config many times */
  for(i=0;i<LOOPS;i++){
    config(&cb[i]);

    /* do something really useful here */
  }

  /* wait for AIO completion: the handler counts down once per finished sync */
  while(count)
    sleep(1);

  /* unlock the backup file */
  while(fcntl(sd,F_SETLKW,&bul));

  /*
   * try to remove backup file
   * Both locks are requested without blocking. The backup is removed only if
   * no updater holds byte 0 and no other process holds a lock on byte 1.
   */
  if(!fcntl(sd,F_SETLK,&fwl)){
    if(!fcntl(sd,F_SETLK,&bwl)){
      unlink("foo~");
      while(fcntl(sd,F_SETLKW,&bul));
    }
    while(fcntl(sd,F_SETLKW,&ful));
  }

  /* clean up shared memory */
  close(sd);

  return 0;
}

Copyright © 2013, Eklektix, Inc.
Comments and public postings are copyrighted by their creators.
Linux is a registered trademark of Linus Torvalds