btrfs: scrub: update device stats when an error is detected

[ Upstream commit ec1f3a207cdf314eae4d4ae145f1ffdb829f0652 ]

[BUG]
Since the migration to the new scrub_stripe interface, scrub no longer
updates the device stats when hitting an error, regardless of whether it
is a read error or a checksum mismatch error. E.g.:

  BTRFS info (device dm-2): scrub: started on devid 1
  BTRFS error (device dm-2): unable to fixup (regular) error at logical 13631488 on dev /dev/mapper/test-scratch1 physical 13631488
  BTRFS warning (device dm-2): checksum error at logical 13631488 on dev /dev/mapper/test-scratch1, physical 13631488, root 5, inode 257, offset 0, length 4096, links 1 (path: file)
  BTRFS error (device dm-2): unable to fixup (regular) error at logical 13631488 on dev /dev/mapper/test-scratch1 physical 13631488
  BTRFS warning (device dm-2): checksum error at logical 13631488 on dev /dev/mapper/test-scratch1, physical 13631488, root 5, inode 257, offset 0, length 4096, links 1 (path: file)
  BTRFS info (device dm-2): scrub: finished on devid 1 with status: 0

Note there is no line showing the device stats error update.

[CAUSE]
In the migration to the new scrub_stripe interface, we no longer call
btrfs_dev_stat_inc_and_print().

[FIX]
- Introduce a new bitmap for metadata generation errors
  * A new bitmap
    @meta_gen_error_bitmap is introduced to record which blocks have
    metadata generation mismatch errors.

  * A new counter for that bitmap
    @init_nr_meta_gen_errors is also introduced to store the number of
    generation mismatch errors that are found during the initial read.

    This is for the error reporting at scrub_stripe_report_errors().

  * New dedicated error message for unrepaired generation mismatches

  * Update @meta_gen_error_bitmap if a transid mismatch is hit

- Add btrfs_dev_stat_inc_and_print() calls to the following call sites
  * scrub_stripe_report_errors()
  * scrub_write_endio()
    This is only for the write errors.

This means there is a minor behavior change:

- The timing of device stats error message
  Since we concentrate the error messages at
  scrub_stripe_report_errors(), the device stats error messages will all
  show up in one go, after the detailed scrub error messages:

   BTRFS error (device dm-2): unable to fixup (regular) error at logical 13631488 on dev /dev/mapper/test-scratch1 physical 13631488
   BTRFS warning (device dm-2): checksum error at logical 13631488 on dev /dev/mapper/test-scratch1, physical 13631488, root 5, inode 257, offset 0, length 4096, links 1 (path: file)
   BTRFS error (device dm-2): unable to fixup (regular) error at logical 13631488 on dev /dev/mapper/test-scratch1 physical 13631488
   BTRFS warning (device dm-2): checksum error at logical 13631488 on dev /dev/mapper/test-scratch1, physical 13631488, root 5, inode 257, offset 0, length 4096, links 1 (path: file)
   BTRFS error (device dm-2): bdev /dev/mapper/test-scratch1 errs: wr 0, rd 0, flush 0, corrupt 1, gen 0
   BTRFS error (device dm-2): bdev /dev/mapper/test-scratch1 errs: wr 0, rd 0, flush 0, corrupt 2, gen 0

Fixes: e02ee89baa ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure")
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
This commit is contained in:
Qu Wenruo
2025-05-01 08:37:54 +09:30
committed by Greg Kroah-Hartman
parent 3cf4d9cae4
commit 7cfb9086b6

View File

@@ -153,12 +153,14 @@ struct scrub_stripe {
unsigned int init_nr_io_errors; unsigned int init_nr_io_errors;
unsigned int init_nr_csum_errors; unsigned int init_nr_csum_errors;
unsigned int init_nr_meta_errors; unsigned int init_nr_meta_errors;
unsigned int init_nr_meta_gen_errors;
/* /*
* The following error bitmaps are all for the current status. * The following error bitmaps are all for the current status.
* Every time we submit a new read, these bitmaps may be updated. * Every time we submit a new read, these bitmaps may be updated.
* *
* error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap; * error_bitmap = io_error_bitmap | csum_error_bitmap |
* meta_error_bitmap | meta_gen_error_bitmap;
* *
* IO and csum errors can happen for both metadata and data. * IO and csum errors can happen for both metadata and data.
*/ */
@@ -166,6 +168,7 @@ struct scrub_stripe {
unsigned long io_error_bitmap; unsigned long io_error_bitmap;
unsigned long csum_error_bitmap; unsigned long csum_error_bitmap;
unsigned long meta_error_bitmap; unsigned long meta_error_bitmap;
unsigned long meta_gen_error_bitmap;
/* For writeback (repair or replace) error reporting. */ /* For writeback (repair or replace) error reporting. */
unsigned long write_error_bitmap; unsigned long write_error_bitmap;
@@ -673,7 +676,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
} }
if (stripe->sectors[sector_nr].generation != if (stripe->sectors[sector_nr].generation !=
btrfs_stack_header_generation(header)) { btrfs_stack_header_generation(header)) {
bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); bitmap_set(&stripe->meta_gen_error_bitmap, sector_nr, sectors_per_tree);
bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info, btrfs_warn_rl(fs_info,
"tree block %llu mirror %u has bad generation, has %llu want %llu", "tree block %llu mirror %u has bad generation, has %llu want %llu",
@@ -685,6 +688,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree); bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
bitmap_clear(&stripe->meta_gen_error_bitmap, sector_nr, sectors_per_tree);
} }
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
@@ -973,8 +977,22 @@ skip:
if (__ratelimit(&rs) && dev) if (__ratelimit(&rs) && dev)
scrub_print_common_warning("header error", dev, false, scrub_print_common_warning("header error", dev, false,
stripe->logical, physical); stripe->logical, physical);
if (test_bit(sector_nr, &stripe->meta_gen_error_bitmap))
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("generation error", dev, false,
stripe->logical, physical);
} }
/* Update the device stats. */
for (int i = 0; i < stripe->init_nr_io_errors; i++)
btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS);
for (int i = 0; i < stripe->init_nr_csum_errors; i++)
btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
/* Generation mismatch error is based on each metadata, not each block. */
for (int i = 0; i < stripe->init_nr_meta_gen_errors;
i += (fs_info->nodesize >> fs_info->sectorsize_bits))
btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS);
spin_lock(&sctx->stat_lock); spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
@@ -983,7 +1001,8 @@ skip:
sctx->stat.no_csum += nr_nodatacsum_sectors; sctx->stat.no_csum += nr_nodatacsum_sectors;
sctx->stat.read_errors += stripe->init_nr_io_errors; sctx->stat.read_errors += stripe->init_nr_io_errors;
sctx->stat.csum_errors += stripe->init_nr_csum_errors; sctx->stat.csum_errors += stripe->init_nr_csum_errors;
sctx->stat.verify_errors += stripe->init_nr_meta_errors; sctx->stat.verify_errors += stripe->init_nr_meta_errors +
stripe->init_nr_meta_gen_errors;
sctx->stat.uncorrectable_errors += sctx->stat.uncorrectable_errors +=
bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
sctx->stat.corrected_errors += nr_repaired_sectors; sctx->stat.corrected_errors += nr_repaired_sectors;
@@ -1029,6 +1048,8 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
stripe->nr_sectors); stripe->nr_sectors);
stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap, stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap,
stripe->nr_sectors); stripe->nr_sectors);
stripe->init_nr_meta_gen_errors = bitmap_weight(&stripe->meta_gen_error_bitmap,
stripe->nr_sectors);
if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
goto out; goto out;
@@ -1143,6 +1164,9 @@ static void scrub_write_endio(struct btrfs_bio *bbio)
bitmap_set(&stripe->write_error_bitmap, sector_nr, bitmap_set(&stripe->write_error_bitmap, sector_nr,
bio_size >> fs_info->sectorsize_bits); bio_size >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&stripe->write_error_lock, flags); spin_unlock_irqrestore(&stripe->write_error_lock, flags);
for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++)
btrfs_dev_stat_inc_and_print(stripe->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
} }
bio_put(&bbio->bio); bio_put(&bbio->bio);
@@ -1505,10 +1529,12 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
stripe->init_nr_io_errors = 0; stripe->init_nr_io_errors = 0;
stripe->init_nr_csum_errors = 0; stripe->init_nr_csum_errors = 0;
stripe->init_nr_meta_errors = 0; stripe->init_nr_meta_errors = 0;
stripe->init_nr_meta_gen_errors = 0;
stripe->error_bitmap = 0; stripe->error_bitmap = 0;
stripe->io_error_bitmap = 0; stripe->io_error_bitmap = 0;
stripe->csum_error_bitmap = 0; stripe->csum_error_bitmap = 0;
stripe->meta_error_bitmap = 0; stripe->meta_error_bitmap = 0;
stripe->meta_gen_error_bitmap = 0;
} }
/* /*