[RFC][PATCH 0/5] Add netlink file system notification interface

August 18th, 2011 - 08:20 am ET by Lukas Czerner | Report spam
Hello.

this is my proposal to add a netlink notification interface which can be used
by file systems to send various warning messages to user space via netlink.
This is actually the same mechanism quota uses to send warnings about exceeding
quota, and it can eventually be merged into this interface so that we have just
one netlink messaging system for file systems.

The first PATCH adds the netlink interface itself and the rest of the
patches wire it up into the file systems (ext3,ext4,xfs,btrfs). So far it
can only send information about data and metadata ENOSPC, but it can be
easily extended, for example for using quota notification.


Here is a description of the first patch:

There might be a lot of crazy things happening inside the file systems,
but they might result in a bogus error code being returned to user space, and
sometimes it is hard to figure out what just happened. This
commit adds an interface which file systems can use to send
better information to user space via netlink, because it
is not bound to error codes.

Also applications might not report problems with the file systems
correctly, hence the administrator will never know about the problem
unless it is already too late. Also in the case of ENOSPC conditions
even if we are checking 'df' output from cronjob, we might miss some
ENOSPC states because it just takes snapshots of the state. Those are
just examples.

With this interface a file system can send a message via netlink
at the moment the problem arises.

In order to use this, a file system must register the netlink
interface with init_fs_nl_family() on module initialization; then it
can send messages with fs_nl_send_warning().

At this point there are only two types of warning FS_NL_ENOSPC_WARN,
which should be used in situations when file system does not have enough
space to reserve data blocks and FS_NL_META_ENOSPC_WARN, for situations
when file system does not have enough space to reserve metadata blocks.
But more can be added in the future.

The code has been based on fs/quota/netlink.c which is used to send
quota warnings to the user space. Eventually it can be merged into this
interface.

I have tested this with a simple tool which exercises various ENOSPC
situations, and with fallocate. You can find the tool below (alloc_test).

For user space to receive the messages I have written a simple tool
(based on the quota_nld code); you can find it below (fsmsg).

Thanks!
-Lukas


[PATCH 1/5] fs: add netlink notification interface
[PATCH 2/5] ext3: use fs netlink interface for ENOSPC conditions
[PATCH 3/5] ext4: use fs netlink interface for ENOSPC conditions
[PATCH 4/5] xfs: use fs netlink interface for ENOSPC conditions
[PATCH 5/5] btrfs: use fs netlink interface for ENOSPC conditions

fs/Makefile | 2 +-
fs/btrfs/extent-tree.c | 13 ++++-
fs/btrfs/super.c | 1 +
fs/ext3/acl.c | 5 +-
fs/ext3/balloc.c | 10 +++-
fs/ext3/inode.c | 4 +-
fs/ext3/namei.c | 10 ++--
fs/ext3/super.c | 1 +
fs/ext3/xattr.c | 2 +-
fs/ext4/acl.c | 5 +-
fs/ext4/balloc.c | 15 ++++--
fs/ext4/ext4.h | 3 +-
fs/ext4/extents.c | 2 +-
fs/ext4/indirect.c | 2 +-
fs/ext4/inode.c | 10 ++--
fs/ext4/namei.c | 10 ++--
fs/ext4/super.c | 1 +
fs/ext4/xattr.c | 2 +-
fs/netlink.c | 107 ++++++++++++++++++++++++++++++++++++++++++
fs/xfs/linux-2.6/xfs_file.c | 2 +
fs/xfs/linux-2.6/xfs_super.c | 1 +
fs/xfs/xfs_vnodeops.c | 10 +++-
include/linux/ext3_fs.h | 3 +-
include/linux/fs.h | 11 ++++
24 files changed, 193 insertions(+), 39 deletions(-)



alloc_test.c

#define _GNU_SOURCE

#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <linux/types.h>


#define BSIZE 4096
#define FILECOUNT 9000000

/*
 * Write @size bytes from @data to @fd over and over until write(2) fails
 * or makes no progress, then fsync the descriptor.
 *
 * The failure (for the tests in this file, normally ENOSPC) is reported
 * with perror().
 *
 * Returns the total number of bytes that were successfully written.
 */
unsigned long long
do_write(int fd, char *data, int size)
{
    unsigned long long count = 0;
    ssize_t len;

    while (1) {
        len = write(fd, data, size);
        /*
         * Only the return value says whether write() failed; errno may
         * still hold a value left behind by an earlier, unrelated call.
         */
        if (len < 0) {
            perror("errno");
            break;
        }
        /* No progress at all: stop instead of spinning forever. */
        if (len == 0)
            break;
        /* A short write is legal, so add what was really written. */
        count += (unsigned long long)len;
    }
    fsync(fd);
    return count;
}

/*
 * Open @filename with the open(2) flags in @flag, fill it with zeroed
 * BSIZE-byte blocks through do_write() until writing stops, print how many
 * bytes were written, then remove the file and sync.
 *
 * The buffer is aligned to the page size (callers in this file also pass
 * O_DIRECT). Exits the process if the file cannot be opened or the buffer
 * cannot be allocated. errno is reset to 0 before returning.
 */
void write_test(char *filename, int flag)
{
    int err, fd, ps = getpagesize();
    unsigned long long count = 0;
    void *data;

    /* Callers pass O_CREAT, which makes the mode argument mandatory. */
    fd = open(filename, flag, 0644);
    if (fd < 0) {
        perror("open");
        exit(1);
    }

    /* posix_memalign() returns the error number; it does not set errno. */
    err = posix_memalign(&data, ps, BSIZE);
    if (err) {
        fprintf(stderr, "posix_memalign: %s\n", strerror(err));
        close(fd);
        exit(1);
    }
    memset(data, 0, BSIZE);
    count = do_write(fd, data, BSIZE);
    printf("%llu Bytes written\n", count);
    close(fd);
    free(data);

    remove(filename);
    sync();
    /* Leave a clean errno behind for whatever runs next. */
    errno = 0;
}

/*
 * Create up to @count directories named "<dir>/testfile_<n>", with n
 * counting down from @count, then remove every directory that was created.
 *
 * Creation stops at the first mkdir(2) failure; the directories made up to
 * that point are still removed. Progress is printed every 10000 entries.
 * Exits the process if the names do not fit the path buffer or if a
 * directory cannot be removed.
 */
void dir_create(char *dir, long count)
{
    char fname[256];
    long i = count;
    int len;

    /* The name built from @count is the longest one this run can produce. */
    len = snprintf(fname, sizeof(fname), "%s/testfile_%ld", dir, count);
    if (len < 0 || (size_t)len >= sizeof(fname)) {
        fprintf(stderr, "path too long: %s\n", dir);
        exit(1);
    }

    printf("0/%ld", count);
    fflush(stdout);
    while (i > 0) {
        snprintf(fname, sizeof(fname), "%s/testfile_%ld", dir, i);
        /*
         * mkdir() returns 0 or -1, never a descriptor, so there is nothing
         * to close here (closing its result would close stdin).
         */
        if (mkdir(fname, 0) < 0) {
            perror("mkdir");
            goto cleanup;
        }
        i--;
        if (!(i % 10000)) {
            printf("\r%ld/%ld", count - i, count);
            fflush(stdout);
        }
    }
    sync();
cleanup:
    printf("\n%ld files created in the directory %s\n", count - i, dir);
    /* Entries i+1 .. count exist at this point; remove them in that order. */
    while (i < count) {
        i++;
        snprintf(fname, sizeof(fname), "%s/testfile_%ld", dir, i);
        if (rmdir(fname) < 0) {
            printf("name %s\n", fname);
            perror("rmdir");
            exit(1);
        }
        if (!(i % 10000)) {
            printf("\r%ld/%ld", count - i, count);
            fflush(stdout);
        }
    }
    printf("\n");
    sync();
}

/*
 * Create up to @count empty files named "<dir>/testfile_<n>", with n
 * counting down from @count, then unlink every file that was created.
 *
 * Creation stops at the first creat(2) failure; the files made up to that
 * point are still removed. Progress is printed every 10000 entries.
 * Exits the process if the names do not fit the path buffer or if a file
 * cannot be unlinked.
 */
void file_create(char *dir, long count)
{
    char fname[256];
    long i = count;
    int fd, len;

    /* The name built from @count is the longest one this run can produce. */
    len = snprintf(fname, sizeof(fname), "%s/testfile_%ld", dir, count);
    if (len < 0 || (size_t)len >= sizeof(fname)) {
        fprintf(stderr, "path too long: %s\n", dir);
        exit(1);
    }

    printf("0/%ld", count);
    fflush(stdout);
    while (i > 0) {
        snprintf(fname, sizeof(fname), "%s/testfile_%ld", dir, i);
        fd = creat(fname, 0);
        if (fd < 0) {
            perror("creat");
            goto cleanup;
        }
        close(fd);
        i--;
        if (!(i % 10000)) {
            printf("\r%ld/%ld", count - i, count);
            fflush(stdout);
        }
    }
    sync();
cleanup:
    printf("\n%ld files created in the directory %s\n", count - i, dir);
    /* Entries i+1 .. count exist at this point; remove them in that order. */
    while (i < count) {
        i++;
        snprintf(fname, sizeof(fname), "%s/testfile_%ld", dir, i);
        if (unlink(fname) < 0) {
            printf("name %s\n", fname);
            perror("unlink");
            exit(1);
        }
        if (!(i % 10000)) {
            printf("\r%ld/%ld", count - i, count);
            fflush(stdout);
        }
    }
    printf("\n");
    sync();
}

/*
 * Run every test against directory @dir: buffered writes, O_DIRECT writes,
 * file creation and directory creation. Each test is run twice in a row.
 *
 * The write tests use "<dir>/testfile"; the creation tests use FILECOUNT
 * entries. Exits the process if that path does not fit the buffer.
 */
void test_all(char *dir)
{
    char fname[256];
    int len;

    len = snprintf(fname, sizeof(fname), "%s/testfile", dir);
    if (len < 0 || (size_t)len >= sizeof(fname)) {
        fprintf(stderr, "path too long: %s\n", dir);
        exit(1);
    }

    printf("[+] Buffered write test\n");
    write_test(fname, O_RDWR | O_CREAT | O_TRUNC);
    write_test(fname, O_RDWR | O_CREAT | O_TRUNC);

    printf("[+] Direct write test\n");
    write_test(fname, O_RDWR | O_CREAT | O_TRUNC | O_DIRECT);
    write_test(fname, O_RDWR | O_CREAT | O_TRUNC | O_DIRECT);

    printf("[+] File creation test\n");
    file_create(dir, FILECOUNT);
    file_create(dir, FILECOUNT);

    printf("[+] Directory creation test\n");
    dir_create(dir, FILECOUNT);
    dir_create(dir, FILECOUNT);
}

/*
 * alloc_test entry point.
 *
 *   argv[1]  directory to run the tests in (required)
 *   argv[2]  test type: 1 buffered write, 2 direct write, 3 file creation,
 *            4 directory creation; when omitted every test is run
 *   argv[3]  number of entries for types 3 and 4 (default FILECOUNT)
 *
 * Returns 0 on completion; exits with status 1 on a usage error, an
 * unknown test type or a path that does not fit the buffer.
 */
int main(int argc, char **argv)
{
    int type, len;
    long count = FILECOUNT;
    char fname[256];

    if (argc < 2) {
        printf("Usage: %s <directory> [n [count]]\n", argv[0]);
        printf("\tn=1 buffered write\n"
               "\tn=2 direct write\n"
               "\tn=3 file creation\n"
               "\tn=4 directory creation\n");
        exit(1);
    }

    if (argc == 2) {
        test_all(argv[1]);
        return 0;
    }

    len = snprintf(fname, sizeof(fname), "%s/testfile", argv[1]);
    if (len < 0 || (size_t)len >= sizeof(fname)) {
        fprintf(stderr, "path too long: %s\n", argv[1]);
        exit(1);
    }

    type = atoi(argv[2]);
    switch (type) {
    case 1:
        printf("[+] Buffered write test\n");
        write_test(fname, O_RDWR | O_CREAT | O_TRUNC);
        break;
    case 2:
        printf("[+] Direct write test\n");
        write_test(fname, O_RDWR | O_CREAT | O_TRUNC | O_DIRECT);
        break;
    case 3:
        printf("[+] File creation test\n");
        if (argc == 4) {
            count = atol(argv[3]);
        }
        file_create(argv[1], count);
        break;
    case 4:
        printf("[+] Directory creation test\n");
        if (argc == 4) {
            count = atol(argv[3]);
        }
        dir_create(argv[1], count);
        break;
    default:
        printf("Type not recognised\n");
        exit(1);
    }
    return 0;
}

fsmsg.c
-
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>
#include <unistd.h>
#include <getopt.h>
#include <utmp.h>
#include <errno.h>
#include <string.h>
#include <fcntl.h>
#include <limits.h>
#include <inttypes.h>

#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>

/* Program name printed by show_help(); assigned in main() from argv[0]. */
char *progname;

/*
 * Userspace copy of the fs netlink interface definitions.
 */

/* Values of the FS_NL_A_WARNING attribute, as decoded by fs_nl_parser(). */
#define FS_NL_NOWARN 0
#define FS_NL_ENOSPC_WARN 1
#define FS_NL_META_ENOSPC_WARN 2

/* Attribute identifiers carried by a warning message. */
enum {
    FS_NL_A_UNSPEC    = 0,
    FS_NL_A_WARNING   = 1,  /* warning type, u32 */
    FS_NL_A_DEV_MAJOR = 2,  /* device major number, u32 */
    FS_NL_A_DEV_MINOR = 3,  /* device minor number, u32 */
    FS_NL_A_CAUSED_ID = 4,  /* id printed as the causing UID, u64 */
    __FS_NL_A_MAX     = 5,  /* one past the last attribute */
};
#define FS_NL_A_MAX (__FS_NL_A_MAX - 1)

/* Generic netlink command identifiers; fs_nl_parser() handles only WARNING. */
enum {
    FS_NL_C_UNSPEC  = 0,
    FS_NL_C_WARNING = 1,
    __FS_NL_C_MAX   = 2,  /* one past the last command */
};
#define FS_NL_C_MAX (__FS_NL_C_MAX - 1)

/*
 * Long options understood by getopt_long(). "--foreground" and
 * "--no-daemon" are synonyms for -F, so either spelling keeps the program
 * in the foreground.
 */
static const struct option options[] = {
    { "help", no_argument, NULL, 'h' },
    { "foreground", no_argument, NULL, 'F' },
    { "no-daemon", no_argument, NULL, 'F' },
    { NULL, 0, NULL, 0 }
};

/*
 * Attribute policy passed to genlmsg_parse() in fs_nl_parser(): the warning
 * type and the device major/minor numbers are 32-bit values, the causing id
 * is a 64-bit value. FS_NL_A_UNSPEC has no entry.
 */
static struct nla_policy fs_nl_warn_cmd_policy[FS_NL_A_MAX+1] = {
[FS_NL_A_WARNING] = { .type = NLA_U32 },
[FS_NL_A_DEV_MAJOR] = { .type = NLA_U32 },
[FS_NL_A_DEV_MINOR] = { .type = NLA_U32 },
[FS_NL_A_CAUSED_ID] = { .type = NLA_U64 },
};

/* User options */
/* Bit in 'flags' set by -F: main() then skips the daemon() call. */
#define FL_NODAEMON 1

/* Bitmask of FL_* user options, filled in by parse_options(). */
int flags;

/*
 * Print the usage text to stdout, one option per line, using the global
 * progname as the program name.
 */
void show_help(void)
{
    printf("Usage: %s [options]\n"
           "Options are:\n"
           " -h --help         shows this text\n"
           " -F --no-daemon    run daemon in foreground\n", progname);
}

/*
 * Print "fsmsg: <string>" as a complete line on stderr and terminate the
 * process with exit status @err. Does not return.
 */
void die(int err, const char *string)
{
    fprintf(stderr, "fsmsg: %s\n", string);
    exit(err);
}

/*
 * Parse the command line.
 *
 *   -h / --help   print the usage text and exit with status 0
 *   -F            set FL_NODAEMON in the global flags
 *
 * Anything else prints the usage text and exits with status 1.
 */
static void parse_options(int argc, char **argv)
{
    int opt;

    /* Only the options handled below are advertised to getopt_long(). */
    while ((opt = getopt_long(argc, argv, "hF", options, NULL)) >= 0) {
        switch (opt) {
        case 'h':
            show_help();
            exit(0);
        case 'F':
            flags |= FL_NODAEMON;
            break;
        default:
            /*
             * opt is '?' here; the offending short option, when there is
             * one, is in optopt.
             */
            if (optopt)
                printf("Unknown option '%c'.\n", optopt);
            show_help();
            exit(1);
        }
    }
}

/*
 * Callback for valid netlink messages (installed by init_netlink()).
 *
 * Messages that are not generic netlink messages, or whose command is not
 * FS_NL_C_WARNING, are ignored. A warning message is decoded and printed
 * to stdout as one line with the device numbers, the causing id and a
 * description of the warning type.
 *
 * @msg: received message
 * @arg: unused
 *
 * Returns 0 when the message was ignored or printed, the negative result
 * of genlmsg_parse() when parsing fails, or -EINVAL when a required
 * attribute is missing.
 */
static int fs_nl_parser(struct nl_msg *msg, void *arg)
{
    struct nlmsghdr *nlh = nlmsg_hdr(msg);
    struct genlmsghdr *ghdr;
    struct nlattr *attrs[FS_NL_A_MAX+1];
    int ret, warntype;
    char *warn_msg;

    if (!genlmsg_valid_hdr(nlh, 0))
        return 0;
    ghdr = nlmsg_data(nlh);
    if (ghdr->cmd != FS_NL_C_WARNING)
        return 0;

    ret = genlmsg_parse(nlh, 0, attrs, FS_NL_A_MAX, fs_nl_warn_cmd_policy);
    if (ret < 0) {
        printf("Error parsing netlink message.\n");
        return ret;
    }
    /* Every attribute read below must be present. */
    if (!attrs[FS_NL_A_WARNING] ||
        !attrs[FS_NL_A_DEV_MAJOR] ||
        !attrs[FS_NL_A_DEV_MINOR] ||
        !attrs[FS_NL_A_CAUSED_ID]) {
        printf("Unknown format of kernel netlink message!\n"
               "Maybe your fsmsg is too old?\n");
        return -EINVAL;
    }
    warntype = nla_get_u32(attrs[FS_NL_A_WARNING]);

    switch (warntype) {
    case FS_NL_ENOSPC_WARN:
        warn_msg = "no space left on the file system";
        break;
    case FS_NL_META_ENOSPC_WARN:
        warn_msg = "not enough space left for metadata";
        break;
    default:
        warn_msg = "unknown file system warning";
        break;
    }

    printf("VFS: on device (%u:%u) from UID=%" PRIu64 " %s.\n",
           nla_get_u32(attrs[FS_NL_A_DEV_MAJOR]),
           nla_get_u32(attrs[FS_NL_A_DEV_MINOR]),
           nla_get_u64(attrs[FS_NL_A_CAUSED_ID]),
           warn_msg);

    return 0;
}

/*
 * Set up the netlink side: allocate a handle, connect it to generic
 * netlink, resolve the "FS_MSG" family, join its multicast group and
 * install fs_nl_parser() as the callback for valid messages.
 *
 * Every failure is reported through die() with status 2, so a returned
 * handle is always usable.
 */
static struct nl_handle *init_netlink(void)
{
    struct nl_handle *handle;
    int ret, family;

    handle = nl_handle_alloc();
    if (!handle)
        die(2, "Cannot allocate netlink handle!");
    nl_disable_sequence_check(handle);
    ret = genl_connect(handle);
    if (ret < 0)
        die(2, "Cannot connect to netlink socket");
    /* The resolve result lives in 'family', so that is what must be tested. */
    family = genl_ctrl_resolve(handle, "FS_MSG");
    if (family < 0)
        die(2, "Cannot resolve fs netlink name");

    /*
     * NOTE(review): the resolved family id is passed as the multicast
     * group id - confirm this matches how the kernel side registers the
     * group.
     */
    ret = nl_socket_add_membership(handle, family);
    if (ret < 0)
        die(2, "Cannot join fs multicast group");

    ret = nl_socket_modify_cb(handle, NL_CB_VALID, NL_CB_CUSTOM,
                              fs_nl_parser, NULL);
    if (ret < 0)
        die(2, "Cannot register callback for"
               " netlink messages");

    return handle;
}

/*
 * Receive and dispatch netlink messages on @nhandle forever. A failed
 * receive is reported on stdout as one line and the loop carries on.
 * Never returns.
 */
static void run(struct nl_handle *nhandle)
{
    int ret;

    while (1) {
        ret = nl_recvmsgs_default(nhandle);
        if (ret < 0)
            printf("Failed to read or parse fs netlink"
                   " message: %s\n", strerror(-ret));
    }
}

/*
 * fsmsg entry point: record the program name, parse the options, set up
 * netlink, detach from the terminal unless -F was given, then process
 * messages forever.
 */
int main(int argc, char **argv)
{
    struct nl_handle *nhandle;
    char *slash;

    /*
     * Take the last path component by hand: basename() is declared only in
     * <libgen.h>, or in <string.h> under _GNU_SOURCE, and neither is in
     * effect in this file, so calling it would rely on an implicit int
     * return type.
     */
    slash = strrchr(argv[0], '/');
    progname = slash ? slash + 1 : argv[0];
    parse_options(argc, argv);

    nhandle = init_netlink();
    if (!(flags & FL_NODAEMON)) {
        if (daemon(0, 0) < 0)
            die(2, "Cannot detach from the terminal");
    }
    run(nhandle);
    return 0;
}
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
email Follow the discussionReplies 11 repliesReplies Make a reply

Replies

#1 Lukas Czerner
August 18th, 2011 - 08:20 am ET | Report spam
Register fs netlink interface and send proper warning if ENOSPC is
encountered. Note that we differentiate between enospc for metadata and
enospc for data.

Also fix ext4_should_retry_alloc() so we do not check for free blocks
when we are actually allocating metadata.

Signed-off-by: Lukas Czerner
CC: Theodore Ts'o
CC: Ext4 Developers List

fs/ext4/acl.c | 5 +++--
fs/ext4/balloc.c | 15 +++++++++++-
fs/ext4/ext4.h | 3 ++-
fs/ext4/extents.c | 2 +-
fs/ext4/indirect.c | 2 +-
fs/ext4/inode.c | 10 +++++--
fs/ext4/namei.c | 10 +++++--
fs/ext4/super.c | 1 +
fs/ext4/xattr.c | 2 +-
9 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index a5c29bb..cc3e72c 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -321,7 +321,7 @@ retry:
error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
ext4_journal_stop(handle);
if (error == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
+ ext4_should_retry_alloc(inode->i_sb, &retries, 0))
goto retry;
out:
posix_acl_release(acl);
@@ -414,7 +414,8 @@ retry:
return PTR_ERR(handle);
error = ext4_set_acl(handle, inode, type, acl);
ext4_journal_stop(handle);
- if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb,
+ &retries, 0))
goto retry;

release_and_out:
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f8224ad..42daae7 100644
a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -426,12 +426,19 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
*
* if the total number of retries exceed three times, return FALSE.
*/
-int ext4_should_retry_alloc(struct super_block *sb, int *retries)
+int ext4_should_retry_alloc(struct super_block *sb, int *retries, int data)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
- (*retries)++ > 3 ||
- !EXT4_SB(sb)->s_journal)
+ if ((data && !ext4_has_free_blocks(EXT4_SB(sb), 1, 0)) ||
+ ++(*retries) >= 3 ||
+ !EXT4_SB(sb)->s_journal) {
+ if (data)
+ fs_nl_send_warning(sb->s_dev, FS_NL_ENOSPC_WARN);
+ else
+ fs_nl_send_warning(sb->s_dev, FS_NL_META_ENOSPC_WARN);
return 0;
+ }
+
+

jbd_debug(1, "%s: retrying operation after ENOSPC", sb->s_id);

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e717dfd..45cb981 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1741,7 +1741,8 @@ extern void ext4_check_blocks_bitmap(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
ext4_group_t block_group,
struct buffer_head ** bh);
-extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
+extern int ext4_should_retry_alloc(struct super_block *sb, int *retries,
+ int data);
struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
ext4_group_t block_group);
extern unsigned ext4_init_block_bitmap(struct super_block *sb,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 57cf568..1c8cb04 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3834,7 +3834,7 @@ retry:
break;
}
if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ ext4_should_retry_alloc(inode->i_sb, &retries, 1)) {
ret = 0;
goto retry;
}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index b8602cd..40153bc 100644
a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -817,7 +817,7 @@ retry:
ext4_truncate_failed_write(inode);
}
}
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries, 1))
goto retry;

if (orphan) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d47264c..d0596f3 100644
a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -814,7 +814,7 @@ retry:
}
}

- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries, 1))
goto retry;
out:
return ret;
@@ -1067,7 +1067,7 @@ repeat:
*/
if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
dquot_release_reservation_block(inode, 1);
- if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ if (ext4_should_retry_alloc(inode->i_sb, &retries, 1)) {
yield();
goto repeat;
}
@@ -2291,7 +2291,7 @@ retry:
ext4_truncate_failed_write(inode);
}

- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries, 1))
goto retry;
out:
return ret;
@@ -4349,7 +4349,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = __block_page_mkwrite(vma, vmf,
ext4_da_get_block_prep);
} while (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries));
+ ext4_should_retry_alloc(inode->i_sb, &retries, 1));
goto out_ret;
}

@@ -4402,7 +4402,7 @@ retry_alloc:
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
}
ext4_journal_stop(handle);
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries, 1))
goto retry_alloc;
out_ret:
ret = block_page_mkwrite_return(ret);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f8068c7..5a2fe5e 100644
a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1764,7 +1764,7 @@ retry:
err = ext4_add_nondir(handle, dentry, inode);
}
ext4_journal_stop(handle);
- if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries, 0))
goto retry;
return err;
}
@@ -1801,7 +1801,7 @@ retry:
err = ext4_add_nondir(handle, dentry, inode);
}
ext4_journal_stop(handle);
- if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries, 0))
goto retry;
return err;
}
@@ -1886,7 +1886,7 @@ out_clear_inode:
out_stop:
brelse(dir_block);
ext4_journal_stop(handle);
- if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries, 0))
goto retry;
return err;
}
@@ -2333,7 +2333,7 @@ retry:
err = ext4_add_nondir(handle, dentry, inode);
out_stop:
ext4_journal_stop(handle);
- if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries, 0))
goto retry;
return err;
err_drop_inode:
@@ -2376,7 +2376,7 @@ retry:
iput(inode);
}
ext4_journal_stop(handle);
- if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries, 0))
goto retry;
return err;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4687fea..83b7ccd 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5011,6 +5011,7 @@ static int __init ext4_init_fs(void)

ext4_li_info = NULL;
mutex_init(&ext4_li_mtx);
+ init_fs_nl_family();
return 0;
out:
unregister_as_ext2();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c757adc..a3e381b 100644
a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1094,7 +1094,7 @@ retry:
value, value_len, flags);
error2 = ext4_journal_stop(handle);
if (error == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
+ ext4_should_retry_alloc(inode->i_sb, &retries, 0))
goto retry;
if (error == 0)
error = error2;
1.7.4.4

To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Similar topics