Files
linux/fs/ocfs2/extent_map.c

1048 lines
24 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* extent_map.c
*
* Block/Cluster mapping functions
*
* Copyright (C) 2004 Oracle. All rights reserved.
*/
#include <linux/fs.h>
#include <linux/init.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 17:04:11 +09:00
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/fiemap.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "super.h"
#include "symlink.h"
#include "aops.h"
#include "ocfs2_trace.h"
#include "buffer_head_io.h"
/*
* The extent caching implementation is intentionally trivial.
*
* We only cache a small number of extents stored directly on the
* inode, so linear order operations are acceptable. If we ever want
* to increase the size of the extent map, then these algorithms must
* get smarter.
*/
void ocfs2_extent_map_init(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
oi->ip_extent_map.em_num_items = 0;
INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
}
static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
unsigned int cpos,
struct ocfs2_extent_map_item **ret_emi)
{
unsigned int range;
struct ocfs2_extent_map_item *emi;
*ret_emi = NULL;
list_for_each_entry(emi, &em->em_list, ei_list) {
range = emi->ei_cpos + emi->ei_clusters;
if (cpos >= emi->ei_cpos && cpos < range) {
list_move(&emi->ei_list, &em->em_list);
*ret_emi = emi;
break;
}
}
}
static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
unsigned int *phys, unsigned int *len,
unsigned int *flags)
{
unsigned int coff;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_extent_map_item *emi;
spin_lock(&oi->ip_lock);
__ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
if (emi) {
coff = cpos - emi->ei_cpos;
*phys = emi->ei_phys + coff;
if (len)
*len = emi->ei_clusters - coff;
if (flags)
*flags = emi->ei_flags;
}
spin_unlock(&oi->ip_lock);
if (emi == NULL)
return -ENOENT;
return 0;
}
/*
* Forget about all clusters equal to or greater than cpos.
*/
void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
{
struct ocfs2_extent_map_item *emi, *n;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_extent_map *em = &oi->ip_extent_map;
LIST_HEAD(tmp_list);
unsigned int range;
spin_lock(&oi->ip_lock);
list_for_each_entry_safe(emi, n, &em->em_list, ei_list) {
if (emi->ei_cpos >= cpos) {
/* Full truncate of this record. */
list_move(&emi->ei_list, &tmp_list);
BUG_ON(em->em_num_items == 0);
em->em_num_items--;
continue;
}
range = emi->ei_cpos + emi->ei_clusters;
if (range > cpos) {
/* Partial truncate */
emi->ei_clusters = cpos - emi->ei_cpos;
}
}
spin_unlock(&oi->ip_lock);
list_for_each_entry_safe(emi, n, &tmp_list, ei_list) {
list_del(&emi->ei_list);
kfree(emi);
}
}
/*
* Is any part of emi2 contained within emi1
*/
static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
struct ocfs2_extent_map_item *emi2)
{
unsigned int range1, range2;
/*
* Check if logical start of emi2 is inside emi1
*/
range1 = emi1->ei_cpos + emi1->ei_clusters;
if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
return 1;
/*
* Check if logical end of emi2 is inside emi1
*/
range2 = emi2->ei_cpos + emi2->ei_clusters;
if (range2 > emi1->ei_cpos && range2 <= range1)
return 1;
return 0;
}
static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
struct ocfs2_extent_map_item *src)
{
dest->ei_cpos = src->ei_cpos;
dest->ei_phys = src->ei_phys;
dest->ei_clusters = src->ei_clusters;
dest->ei_flags = src->ei_flags;
}
/*
* Try to merge emi with ins. Returns 1 if merge succeeds, zero
* otherwise.
*/
static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
struct ocfs2_extent_map_item *ins)
{
/*
* Handle contiguousness
*/
if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
ins->ei_flags == emi->ei_flags) {
emi->ei_clusters += ins->ei_clusters;
return 1;
} else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
(ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
ins->ei_flags == emi->ei_flags) {
emi->ei_phys = ins->ei_phys;
emi->ei_cpos = ins->ei_cpos;
emi->ei_clusters += ins->ei_clusters;
return 1;
}
/*
* Overlapping extents - this shouldn't happen unless we've
* split an extent to change it's flags. That is exceedingly
* rare, so there's no sense in trying to optimize it yet.
*/
if (ocfs2_ei_is_contained(emi, ins) ||
ocfs2_ei_is_contained(ins, emi)) {
ocfs2_copy_emi_fields(emi, ins);
return 1;
}
/* No merge was possible. */
return 0;
}
/*
* In order to reduce complexity on the caller, this insert function
* is intentionally liberal in what it will accept.
*
* The only rule is that the truncate call *must* be used whenever
* records have been deleted. This avoids inserting overlapping
* records with different physical mappings.
*/
void ocfs2_extent_map_insert_rec(struct inode *inode,
struct ocfs2_extent_rec *rec)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_extent_map *em = &oi->ip_extent_map;
struct ocfs2_extent_map_item *emi, *new_emi = NULL;
struct ocfs2_extent_map_item ins;
ins.ei_cpos = le32_to_cpu(rec->e_cpos);
ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
le64_to_cpu(rec->e_blkno));
ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
ins.ei_flags = rec->e_flags;
search:
spin_lock(&oi->ip_lock);
list_for_each_entry(emi, &em->em_list, ei_list) {
if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
list_move(&emi->ei_list, &em->em_list);
spin_unlock(&oi->ip_lock);
goto out;
}
}
/*
* No item could be merged.
*
* Either allocate and add a new item, or overwrite the last recently
* inserted.
*/
if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
if (new_emi == NULL) {
spin_unlock(&oi->ip_lock);
new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
if (new_emi == NULL)
goto out;
goto search;
}
ocfs2_copy_emi_fields(new_emi, &ins);
list_add(&new_emi->ei_list, &em->em_list);
em->em_num_items++;
new_emi = NULL;
} else {
BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
emi = list_entry(em->em_list.prev,
struct ocfs2_extent_map_item, ei_list);
list_move(&emi->ei_list, &em->em_list);
ocfs2_copy_emi_fields(emi, &ins);
}
spin_unlock(&oi->ip_lock);
out:
kfree(new_emi);
}
static int ocfs2_last_eb_is_empty(struct inode *inode,
struct ocfs2_dinode *di)
{
int ret, next_free;
u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
struct buffer_head *eb_bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
"Inode %lu has non zero tree depth in leaf block %llu\n",
inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
}
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0 ||
(next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
ret = 1;
out:
brelse(eb_bh);
return ret;
}
/*
* Return the 1st index within el which contains an extent start
* larger than v_cluster.
*/
static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
u32 v_cluster)
{
int i;
struct ocfs2_extent_rec *rec;
for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
rec = &el->l_recs[i];
if (v_cluster < le32_to_cpu(rec->e_cpos))
break;
}
return i;
}
/*
* Figure out the size of a hole which starts at v_cluster within the given
* extent list.
*
* If there is no more allocation past v_cluster, we return the maximum
* cluster size minus v_cluster.
*
* If we have in-inode extents, then el points to the dinode list and
* eb_bh is NULL. Otherwise, eb_bh should point to the extent block
* containing el.
*/
int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
struct ocfs2_extent_list *el,
struct buffer_head *eb_bh,
u32 v_cluster,
u32 *num_clusters)
{
int ret, i;
struct buffer_head *next_eb_bh = NULL;
struct ocfs2_extent_block *eb, *next_eb;
i = ocfs2_search_for_hole_index(el, v_cluster);
if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
eb = (struct ocfs2_extent_block *)eb_bh->b_data;
/*
* Check the next leaf for any extents.
*/
if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
goto no_more_extents;
ret = ocfs2_read_extent_block(ci,
le64_to_cpu(eb->h_next_leaf_blk),
&next_eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
el = &next_eb->h_list;
i = ocfs2_search_for_hole_index(el, v_cluster);
}
no_more_extents:
if (i == le16_to_cpu(el->l_next_free_rec)) {
/*
* We're at the end of our existing allocation. Just
* return the maximum number of clusters we could
* possibly allocate.
*/
*num_clusters = UINT_MAX - v_cluster;
} else {
*num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
}
ret = 0;
out:
brelse(next_eb_bh);
return ret;
}
static int ocfs2_get_clusters_nocache(struct inode *inode,
struct buffer_head *di_bh,
u32 v_cluster, unsigned int *hole_len,
struct ocfs2_extent_rec *ret_rec,
unsigned int *is_last)
{
int i, ret, tree_height, len;
struct ocfs2_dinode *di;
treewide: Remove uninitialized_var() usage Using uninitialized_var() is dangerous as it papers over real bugs[1] (or can in the future), and suppresses unrelated compiler warnings (e.g. "unused variable"). If the compiler thinks it is uninitialized, either simply initialize the variable or make compiler changes. In preparation for removing[2] the[3] macro[4], remove all remaining needless uses with the following script: git grep '\buninitialized_var\b' | cut -d: -f1 | sort -u | \ xargs perl -pi -e \ 's/\buninitialized_var\(([^\)]+)\)/\1/g; s:\s*/\* (GCC be quiet|to make compiler happy) \*/$::g;' drivers/video/fbdev/riva/riva_hw.c was manually tweaked to avoid pathological white-space. No outstanding warnings were found building allmodconfig with GCC 9.3.0 for x86_64, i386, arm64, arm, powerpc, powerpc64le, s390x, mips, sparc64, alpha, and m68k. [1] https://lore.kernel.org/lkml/20200603174714.192027-1-glider@google.com/ [2] https://lore.kernel.org/lkml/CA+55aFw+Vbj0i=1TGqCR5vQkCzWJ0QxK6CernOU6eedsudAixw@mail.gmail.com/ [3] https://lore.kernel.org/lkml/CA+55aFwgbgqhbp1fkxvRKEpzyR5J8n1vKT1VZdz9knmPuXhOeg@mail.gmail.com/ [4] https://lore.kernel.org/lkml/CA+55aFz2500WfbKXAx8s67wrm9=yVJu65TpLgN_ybYNv0VEOKA@mail.gmail.com/ Reviewed-by: Leon Romanovsky <leonro@mellanox.com> # drivers/infiniband and mlx4/mlx5 Acked-by: Jason Gunthorpe <jgg@mellanox.com> # IB Acked-by: Kalle Valo <kvalo@codeaurora.org> # wireless drivers Reviewed-by: Chao Yu <yuchao0@huawei.com> # erofs Signed-off-by: Kees Cook <keescook@chromium.org>
2020-06-03 13:09:38 -07:00
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
struct buffer_head *eb_bh = NULL;
memset(ret_rec, 0, sizeof(*ret_rec));
if (is_last)
*is_last = 0;
di = (struct ocfs2_dinode *) di_bh->b_data;
el = &di->id2.i_list;
tree_height = le16_to_cpu(el->l_tree_depth);
if (tree_height > 0) {
ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
&eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
"Inode %lu has non zero tree depth in leaf block %llu\n",
inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
}
}
if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) {
ocfs2_error(inode->i_sb,
"Inode %lu has an invalid extent (next_free_rec %u, count %u)\n",
inode->i_ino,
le16_to_cpu(el->l_next_free_rec),
le16_to_cpu(el->l_count));
ret = -EROFS;
goto out;
}
i = ocfs2_search_extent_list(el, v_cluster);
if (i == -1) {
/*
* Holes can be larger than the maximum size of an
* extent, so we return their lengths in a separate
* field.
*/
if (hole_len) {
ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode),
el, eb_bh,
v_cluster, &len);
if (ret) {
mlog_errno(ret);
goto out;
}
*hole_len = len;
}
goto out_hole;
}
rec = &el->l_recs[i];
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
ocfs2_error(inode->i_sb,
"Inode %lu has bad extent record (%u, %u, 0)\n",
inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
goto out;
}
*ret_rec = *rec;
/*
* Checking for last extent is potentially expensive - we
* might have to look at the next leaf over to see if it's
* empty.
*
* The first two checks are to see whether the caller even
* cares for this information, and if the extent is at least
* the last in it's list.
*
* If those hold true, then the extent is last if any of the
* additional conditions hold true:
* - Extent list is in-inode
* - Extent list is right-most
* - Extent list is 2nd to rightmost, with empty right-most
*/
if (is_last) {
if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
if (tree_height == 0)
*is_last = 1;
else if (eb->h_blkno == di->i_last_eb_blk)
*is_last = 1;
else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
ret = ocfs2_last_eb_is_empty(inode, di);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
if (ret == 1)
*is_last = 1;
}
}
}
out_hole:
ret = 0;
out:
brelse(eb_bh);
return ret;
}
static void ocfs2_relative_extent_offsets(struct super_block *sb,
u32 v_cluster,
struct ocfs2_extent_rec *rec,
u32 *p_cluster, u32 *num_clusters)
{
u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
*p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
*p_cluster = *p_cluster + coff;
if (num_clusters)
*num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
}
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
struct ocfs2_extent_list *el,
unsigned int *extent_flags)
{
int ret = 0, i;
struct buffer_head *eb_bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_rec *rec;
u32 coff;
if (el->l_tree_depth) {
ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
&eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
"Inode %lu has non zero tree depth in xattr leaf block %llu\n",
inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
}
}
i = ocfs2_search_extent_list(el, v_cluster);
if (i == -1) {
ret = -EROFS;
mlog_errno(ret);
goto out;
} else {
rec = &el->l_recs[i];
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
ocfs2_error(inode->i_sb,
"Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
goto out;
}
coff = v_cluster - le32_to_cpu(rec->e_cpos);
*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
le64_to_cpu(rec->e_blkno));
*p_cluster = *p_cluster + coff;
if (num_clusters)
*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
if (extent_flags)
*extent_flags = rec->e_flags;
}
out:
brelse(eb_bh);
return ret;
}
int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
unsigned int *extent_flags)
{
int ret;
treewide: Remove uninitialized_var() usage Using uninitialized_var() is dangerous as it papers over real bugs[1] (or can in the future), and suppresses unrelated compiler warnings (e.g. "unused variable"). If the compiler thinks it is uninitialized, either simply initialize the variable or make compiler changes. In preparation for removing[2] the[3] macro[4], remove all remaining needless uses with the following script: git grep '\buninitialized_var\b' | cut -d: -f1 | sort -u | \ xargs perl -pi -e \ 's/\buninitialized_var\(([^\)]+)\)/\1/g; s:\s*/\* (GCC be quiet|to make compiler happy) \*/$::g;' drivers/video/fbdev/riva/riva_hw.c was manually tweaked to avoid pathological white-space. No outstanding warnings were found building allmodconfig with GCC 9.3.0 for x86_64, i386, arm64, arm, powerpc, powerpc64le, s390x, mips, sparc64, alpha, and m68k. [1] https://lore.kernel.org/lkml/20200603174714.192027-1-glider@google.com/ [2] https://lore.kernel.org/lkml/CA+55aFw+Vbj0i=1TGqCR5vQkCzWJ0QxK6CernOU6eedsudAixw@mail.gmail.com/ [3] https://lore.kernel.org/lkml/CA+55aFwgbgqhbp1fkxvRKEpzyR5J8n1vKT1VZdz9knmPuXhOeg@mail.gmail.com/ [4] https://lore.kernel.org/lkml/CA+55aFz2500WfbKXAx8s67wrm9=yVJu65TpLgN_ybYNv0VEOKA@mail.gmail.com/ Reviewed-by: Leon Romanovsky <leonro@mellanox.com> # drivers/infiniband and mlx4/mlx5 Acked-by: Jason Gunthorpe <jgg@mellanox.com> # IB Acked-by: Kalle Valo <kvalo@codeaurora.org> # wireless drivers Reviewed-by: Chao Yu <yuchao0@huawei.com> # erofs Signed-off-by: Kees Cook <keescook@chromium.org>
2020-06-03 13:09:38 -07:00
unsigned int hole_len, flags = 0;
struct buffer_head *di_bh = NULL;
struct ocfs2_extent_rec rec;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
ret = -ERANGE;
mlog_errno(ret);
goto out;
}
ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
num_clusters, extent_flags);
if (ret == 0)
goto out;
ret = ocfs2_read_inode_block(inode, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
&rec, NULL);
if (ret) {
mlog_errno(ret);
goto out;
}
if (rec.e_blkno == 0ULL) {
/*
* A hole was found. Return some canned values that
* callers can key on. If asked for, num_clusters will
* be populated with the size of the hole.
*/
*p_cluster = 0;
if (num_clusters) {
*num_clusters = hole_len;
}
} else {
ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
p_cluster, num_clusters);
flags = rec.e_flags;
ocfs2_extent_map_insert_rec(inode, &rec);
}
if (extent_flags)
*extent_flags = flags;
out:
brelse(di_bh);
return ret;
}
/*
* This expects alloc_sem to be held. The allocation cannot change at
* all while the map is in the process of being updated.
*/
int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
u64 *ret_count, unsigned int *extent_flags)
{
int ret;
int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
u32 cpos, num_clusters, p_cluster;
u64 boff = 0;
cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
extent_flags);
if (ret) {
mlog_errno(ret);
goto out;
}
/*
* p_cluster == 0 indicates a hole.
*/
if (p_cluster) {
boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
boff += (v_blkno & (u64)(bpc - 1));
}
*p_blkno = boff;
if (ret_count) {
*ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
*ret_count -= v_blkno & (u64)(bpc - 1);
}
out:
return ret;
}
/*
* The ocfs2_fiemap_inline() may be a little bit misleading, since
* it not only handles the fiemap for inlined files, but also deals
* with the fast symlink, cause they have no difference for extent
* mapping per se.
ocfs2: fix recursive semaphore deadlock in fiemap call syzbot detected a OCFS2 hang due to a recursive semaphore on a FS_IOC_FIEMAP of the extent list on a specially crafted mmap file. context_switch kernel/sched/core.c:5357 [inline] __schedule+0x1798/0x4cc0 kernel/sched/core.c:6961 __schedule_loop kernel/sched/core.c:7043 [inline] schedule+0x165/0x360 kernel/sched/core.c:7058 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7115 rwsem_down_write_slowpath+0x872/0xfe0 kernel/locking/rwsem.c:1185 __down_write_common kernel/locking/rwsem.c:1317 [inline] __down_write kernel/locking/rwsem.c:1326 [inline] down_write+0x1ab/0x1f0 kernel/locking/rwsem.c:1591 ocfs2_page_mkwrite+0x2ff/0xc40 fs/ocfs2/mmap.c:142 do_page_mkwrite+0x14d/0x310 mm/memory.c:3361 wp_page_shared mm/memory.c:3762 [inline] do_wp_page+0x268d/0x5800 mm/memory.c:3981 handle_pte_fault mm/memory.c:6068 [inline] __handle_mm_fault+0x1033/0x5440 mm/memory.c:6195 handle_mm_fault+0x40a/0x8e0 mm/memory.c:6364 do_user_addr_fault+0x764/0x1390 arch/x86/mm/fault.c:1387 handle_page_fault arch/x86/mm/fault.c:1476 [inline] exc_page_fault+0x76/0xf0 arch/x86/mm/fault.c:1532 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623 RIP: 0010:copy_user_generic arch/x86/include/asm/uaccess_64.h:126 [inline] RIP: 0010:raw_copy_to_user arch/x86/include/asm/uaccess_64.h:147 [inline] RIP: 0010:_inline_copy_to_user include/linux/uaccess.h:197 [inline] RIP: 0010:_copy_to_user+0x85/0xb0 lib/usercopy.c:26 Code: e8 00 bc f7 fc 4d 39 fc 72 3d 4d 39 ec 77 38 e8 91 b9 f7 fc 4c 89 f7 89 de e8 47 25 5b fd 0f 01 cb 4c 89 ff 48 89 d9 4c 89 f6 <f3> a4 0f 1f 00 48 89 cb 0f 01 ca 48 89 d8 5b 41 5c 41 5d 41 5e 41 RSP: 0018:ffffc9000403f950 EFLAGS: 00050256 RAX: ffffffff84c7f101 RBX: 0000000000000038 RCX: 0000000000000038 RDX: 0000000000000000 RSI: ffffc9000403f9e0 RDI: 0000200000000060 RBP: ffffc9000403fa90 R08: ffffc9000403fa17 R09: 1ffff92000807f42 R10: dffffc0000000000 R11: fffff52000807f43 R12: 0000200000000098 R13: 00007ffffffff000 R14: ffffc9000403f9e0 R15: 0000200000000060 copy_to_user include/linux/uaccess.h:225 [inline] fiemap_fill_next_extent+0x1c0/0x390 fs/ioctl.c:145 ocfs2_fiemap+0x888/0xc90 fs/ocfs2/extent_map.c:806 ioctl_fiemap fs/ioctl.c:220 [inline] do_vfs_ioctl+0x1173/0x1430 fs/ioctl.c:532 __do_sys_ioctl fs/ioctl.c:596 [inline] __se_sys_ioctl+0x82/0x170 fs/ioctl.c:584 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5f13850fd9 RSP: 002b:00007ffe3b3518b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000200000000000 RCX: 00007f5f13850fd9 RDX: 0000200000000040 RSI: 00000000c020660b RDI: 0000000000000004 RBP: 6165627472616568 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe3b3518f0 R13: 00007ffe3b351b18 R14: 431bde82d7b634db R15: 00007f5f1389a03b ocfs2_fiemap() takes a read lock of the ip_alloc_sem semaphore (since v2.6.22-527-g7307de80510a) and calls fiemap_fill_next_extent() to read the extent list of this running mmap executable. The user supplied buffer to hold the fiemap information page faults calling ocfs2_page_mkwrite() which will take a write lock (since v2.6.27-38-g00dc417fa3e7) of the same semaphore. This recursive semaphore will hold filesystem locks and causes a hang of the fileystem. The ip_alloc_sem protects the inode extent list and size. Release the read semphore before calling fiemap_fill_next_extent() in ocfs2_fiemap() and ocfs2_fiemap_inline(). This does an unnecessary semaphore lock/unlock on the last extent but simplifies the error path. Link: https://lkml.kernel.org/r/61d1a62b-2631-4f12-81e2-cd689914360b@oracle.com Fixes: 00dc417fa3e7 ("ocfs2: fiemap support") Signed-off-by: Mark Tinguely <mark.tinguely@oracle.com> Reported-by: syzbot+541dcc6ee768f77103e7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=541dcc6ee768f77103e7 Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Changwei Ge <gechangwei@live.cn> Cc: Jun Piao <piaojun@huawei.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-08-29 10:18:15 -05:00
*
* Must be called with ip_alloc_sem semaphore held.
*/
static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
struct fiemap_extent_info *fieinfo,
u64 map_start)
{
int ret;
unsigned int id_count;
struct ocfs2_dinode *di;
u64 phys;
u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
ocfs2: fix recursive semaphore deadlock in fiemap call syzbot detected a OCFS2 hang due to a recursive semaphore on a FS_IOC_FIEMAP of the extent list on a specially crafted mmap file. context_switch kernel/sched/core.c:5357 [inline] __schedule+0x1798/0x4cc0 kernel/sched/core.c:6961 __schedule_loop kernel/sched/core.c:7043 [inline] schedule+0x165/0x360 kernel/sched/core.c:7058 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7115 rwsem_down_write_slowpath+0x872/0xfe0 kernel/locking/rwsem.c:1185 __down_write_common kernel/locking/rwsem.c:1317 [inline] __down_write kernel/locking/rwsem.c:1326 [inline] down_write+0x1ab/0x1f0 kernel/locking/rwsem.c:1591 ocfs2_page_mkwrite+0x2ff/0xc40 fs/ocfs2/mmap.c:142 do_page_mkwrite+0x14d/0x310 mm/memory.c:3361 wp_page_shared mm/memory.c:3762 [inline] do_wp_page+0x268d/0x5800 mm/memory.c:3981 handle_pte_fault mm/memory.c:6068 [inline] __handle_mm_fault+0x1033/0x5440 mm/memory.c:6195 handle_mm_fault+0x40a/0x8e0 mm/memory.c:6364 do_user_addr_fault+0x764/0x1390 arch/x86/mm/fault.c:1387 handle_page_fault arch/x86/mm/fault.c:1476 [inline] exc_page_fault+0x76/0xf0 arch/x86/mm/fault.c:1532 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623 RIP: 0010:copy_user_generic arch/x86/include/asm/uaccess_64.h:126 [inline] RIP: 0010:raw_copy_to_user arch/x86/include/asm/uaccess_64.h:147 [inline] RIP: 0010:_inline_copy_to_user include/linux/uaccess.h:197 [inline] RIP: 0010:_copy_to_user+0x85/0xb0 lib/usercopy.c:26 Code: e8 00 bc f7 fc 4d 39 fc 72 3d 4d 39 ec 77 38 e8 91 b9 f7 fc 4c 89 f7 89 de e8 47 25 5b fd 0f 01 cb 4c 89 ff 48 89 d9 4c 89 f6 <f3> a4 0f 1f 00 48 89 cb 0f 01 ca 48 89 d8 5b 41 5c 41 5d 41 5e 41 RSP: 0018:ffffc9000403f950 EFLAGS: 00050256 RAX: ffffffff84c7f101 RBX: 0000000000000038 RCX: 0000000000000038 RDX: 0000000000000000 RSI: ffffc9000403f9e0 RDI: 0000200000000060 RBP: ffffc9000403fa90 R08: ffffc9000403fa17 R09: 1ffff92000807f42 R10: dffffc0000000000 R11: fffff52000807f43 R12: 0000200000000098 R13: 00007ffffffff000 R14: ffffc9000403f9e0 R15: 0000200000000060 copy_to_user include/linux/uaccess.h:225 [inline] fiemap_fill_next_extent+0x1c0/0x390 fs/ioctl.c:145 ocfs2_fiemap+0x888/0xc90 fs/ocfs2/extent_map.c:806 ioctl_fiemap fs/ioctl.c:220 [inline] do_vfs_ioctl+0x1173/0x1430 fs/ioctl.c:532 __do_sys_ioctl fs/ioctl.c:596 [inline] __se_sys_ioctl+0x82/0x170 fs/ioctl.c:584 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5f13850fd9 RSP: 002b:00007ffe3b3518b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000200000000000 RCX: 00007f5f13850fd9 RDX: 0000200000000040 RSI: 00000000c020660b RDI: 0000000000000004 RBP: 6165627472616568 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe3b3518f0 R13: 00007ffe3b351b18 R14: 431bde82d7b634db R15: 00007f5f1389a03b ocfs2_fiemap() takes a read lock of the ip_alloc_sem semaphore (since v2.6.22-527-g7307de80510a) and calls fiemap_fill_next_extent() to read the extent list of this running mmap executable. The user supplied buffer to hold the fiemap information page faults calling ocfs2_page_mkwrite() which will take a write lock (since v2.6.27-38-g00dc417fa3e7) of the same semaphore. This recursive semaphore will hold filesystem locks and causes a hang of the fileystem. The ip_alloc_sem protects the inode extent list and size. Release the read semphore before calling fiemap_fill_next_extent() in ocfs2_fiemap() and ocfs2_fiemap_inline(). This does an unnecessary semaphore lock/unlock on the last extent but simplifies the error path. Link: https://lkml.kernel.org/r/61d1a62b-2631-4f12-81e2-cd689914360b@oracle.com Fixes: 00dc417fa3e7 ("ocfs2: fiemap support") Signed-off-by: Mark Tinguely <mark.tinguely@oracle.com> Reported-by: syzbot+541dcc6ee768f77103e7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=541dcc6ee768f77103e7 Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Changwei Ge <gechangwei@live.cn> Cc: Jun Piao <piaojun@huawei.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-08-29 10:18:15 -05:00
lockdep_assert_held_read(&oi->ip_alloc_sem);
di = (struct ocfs2_dinode *)di_bh->b_data;
if (ocfs2_inode_is_fast_symlink(inode))
id_count = ocfs2_fast_symlink_chars(inode->i_sb);
else
id_count = le16_to_cpu(di->id2.i_data.id_count);
if (map_start < id_count) {
phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
if (ocfs2_inode_is_fast_symlink(inode))
phys += offsetof(struct ocfs2_dinode, id2.i_symlink);
else
phys += offsetof(struct ocfs2_dinode,
id2.i_data.id_data);
ocfs2: fix recursive semaphore deadlock in fiemap call syzbot detected a OCFS2 hang due to a recursive semaphore on a FS_IOC_FIEMAP of the extent list on a specially crafted mmap file. context_switch kernel/sched/core.c:5357 [inline] __schedule+0x1798/0x4cc0 kernel/sched/core.c:6961 __schedule_loop kernel/sched/core.c:7043 [inline] schedule+0x165/0x360 kernel/sched/core.c:7058 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7115 rwsem_down_write_slowpath+0x872/0xfe0 kernel/locking/rwsem.c:1185 __down_write_common kernel/locking/rwsem.c:1317 [inline] __down_write kernel/locking/rwsem.c:1326 [inline] down_write+0x1ab/0x1f0 kernel/locking/rwsem.c:1591 ocfs2_page_mkwrite+0x2ff/0xc40 fs/ocfs2/mmap.c:142 do_page_mkwrite+0x14d/0x310 mm/memory.c:3361 wp_page_shared mm/memory.c:3762 [inline] do_wp_page+0x268d/0x5800 mm/memory.c:3981 handle_pte_fault mm/memory.c:6068 [inline] __handle_mm_fault+0x1033/0x5440 mm/memory.c:6195 handle_mm_fault+0x40a/0x8e0 mm/memory.c:6364 do_user_addr_fault+0x764/0x1390 arch/x86/mm/fault.c:1387 handle_page_fault arch/x86/mm/fault.c:1476 [inline] exc_page_fault+0x76/0xf0 arch/x86/mm/fault.c:1532 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623 RIP: 0010:copy_user_generic arch/x86/include/asm/uaccess_64.h:126 [inline] RIP: 0010:raw_copy_to_user arch/x86/include/asm/uaccess_64.h:147 [inline] RIP: 0010:_inline_copy_to_user include/linux/uaccess.h:197 [inline] RIP: 0010:_copy_to_user+0x85/0xb0 lib/usercopy.c:26 Code: e8 00 bc f7 fc 4d 39 fc 72 3d 4d 39 ec 77 38 e8 91 b9 f7 fc 4c 89 f7 89 de e8 47 25 5b fd 0f 01 cb 4c 89 ff 48 89 d9 4c 89 f6 <f3> a4 0f 1f 00 48 89 cb 0f 01 ca 48 89 d8 5b 41 5c 41 5d 41 5e 41 RSP: 0018:ffffc9000403f950 EFLAGS: 00050256 RAX: ffffffff84c7f101 RBX: 0000000000000038 RCX: 0000000000000038 RDX: 0000000000000000 RSI: ffffc9000403f9e0 RDI: 0000200000000060 RBP: ffffc9000403fa90 R08: ffffc9000403fa17 R09: 1ffff92000807f42 R10: dffffc0000000000 R11: fffff52000807f43 R12: 0000200000000098 R13: 00007ffffffff000 R14: ffffc9000403f9e0 R15: 0000200000000060 copy_to_user include/linux/uaccess.h:225 [inline] fiemap_fill_next_extent+0x1c0/0x390 fs/ioctl.c:145 ocfs2_fiemap+0x888/0xc90 fs/ocfs2/extent_map.c:806 ioctl_fiemap fs/ioctl.c:220 [inline] do_vfs_ioctl+0x1173/0x1430 fs/ioctl.c:532 __do_sys_ioctl fs/ioctl.c:596 [inline] __se_sys_ioctl+0x82/0x170 fs/ioctl.c:584 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5f13850fd9 RSP: 002b:00007ffe3b3518b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000200000000000 RCX: 00007f5f13850fd9 RDX: 0000200000000040 RSI: 00000000c020660b RDI: 0000000000000004 RBP: 6165627472616568 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe3b3518f0 R13: 00007ffe3b351b18 R14: 431bde82d7b634db R15: 00007f5f1389a03b ocfs2_fiemap() takes a read lock of the ip_alloc_sem semaphore (since v2.6.22-527-g7307de80510a) and calls fiemap_fill_next_extent() to read the extent list of this running mmap executable. The user supplied buffer to hold the fiemap information page faults calling ocfs2_page_mkwrite() which will take a write lock (since v2.6.27-38-g00dc417fa3e7) of the same semaphore. This recursive semaphore will hold filesystem locks and causes a hang of the fileystem. The ip_alloc_sem protects the inode extent list and size. Release the read semphore before calling fiemap_fill_next_extent() in ocfs2_fiemap() and ocfs2_fiemap_inline(). This does an unnecessary semaphore lock/unlock on the last extent but simplifies the error path. Link: https://lkml.kernel.org/r/61d1a62b-2631-4f12-81e2-cd689914360b@oracle.com Fixes: 00dc417fa3e7 ("ocfs2: fiemap support") Signed-off-by: Mark Tinguely <mark.tinguely@oracle.com> Reported-by: syzbot+541dcc6ee768f77103e7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=541dcc6ee768f77103e7 Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Changwei Ge <gechangwei@live.cn> Cc: Jun Piao <piaojun@huawei.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-08-29 10:18:15 -05:00
/* Release the ip_alloc_sem to prevent deadlock on page fault */
up_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
flags);
ocfs2: fix recursive semaphore deadlock in fiemap call syzbot detected a OCFS2 hang due to a recursive semaphore on a FS_IOC_FIEMAP of the extent list on a specially crafted mmap file. context_switch kernel/sched/core.c:5357 [inline] __schedule+0x1798/0x4cc0 kernel/sched/core.c:6961 __schedule_loop kernel/sched/core.c:7043 [inline] schedule+0x165/0x360 kernel/sched/core.c:7058 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7115 rwsem_down_write_slowpath+0x872/0xfe0 kernel/locking/rwsem.c:1185 __down_write_common kernel/locking/rwsem.c:1317 [inline] __down_write kernel/locking/rwsem.c:1326 [inline] down_write+0x1ab/0x1f0 kernel/locking/rwsem.c:1591 ocfs2_page_mkwrite+0x2ff/0xc40 fs/ocfs2/mmap.c:142 do_page_mkwrite+0x14d/0x310 mm/memory.c:3361 wp_page_shared mm/memory.c:3762 [inline] do_wp_page+0x268d/0x5800 mm/memory.c:3981 handle_pte_fault mm/memory.c:6068 [inline] __handle_mm_fault+0x1033/0x5440 mm/memory.c:6195 handle_mm_fault+0x40a/0x8e0 mm/memory.c:6364 do_user_addr_fault+0x764/0x1390 arch/x86/mm/fault.c:1387 handle_page_fault arch/x86/mm/fault.c:1476 [inline] exc_page_fault+0x76/0xf0 arch/x86/mm/fault.c:1532 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623 RIP: 0010:copy_user_generic arch/x86/include/asm/uaccess_64.h:126 [inline] RIP: 0010:raw_copy_to_user arch/x86/include/asm/uaccess_64.h:147 [inline] RIP: 0010:_inline_copy_to_user include/linux/uaccess.h:197 [inline] RIP: 0010:_copy_to_user+0x85/0xb0 lib/usercopy.c:26 Code: e8 00 bc f7 fc 4d 39 fc 72 3d 4d 39 ec 77 38 e8 91 b9 f7 fc 4c 89 f7 89 de e8 47 25 5b fd 0f 01 cb 4c 89 ff 48 89 d9 4c 89 f6 <f3> a4 0f 1f 00 48 89 cb 0f 01 ca 48 89 d8 5b 41 5c 41 5d 41 5e 41 RSP: 0018:ffffc9000403f950 EFLAGS: 00050256 RAX: ffffffff84c7f101 RBX: 0000000000000038 RCX: 0000000000000038 RDX: 0000000000000000 RSI: ffffc9000403f9e0 RDI: 0000200000000060 RBP: ffffc9000403fa90 R08: ffffc9000403fa17 R09: 1ffff92000807f42 R10: dffffc0000000000 R11: fffff52000807f43 R12: 0000200000000098 R13: 00007ffffffff000 R14: ffffc9000403f9e0 R15: 0000200000000060 copy_to_user include/linux/uaccess.h:225 [inline] fiemap_fill_next_extent+0x1c0/0x390 fs/ioctl.c:145 ocfs2_fiemap+0x888/0xc90 fs/ocfs2/extent_map.c:806 ioctl_fiemap fs/ioctl.c:220 [inline] do_vfs_ioctl+0x1173/0x1430 fs/ioctl.c:532 __do_sys_ioctl fs/ioctl.c:596 [inline] __se_sys_ioctl+0x82/0x170 fs/ioctl.c:584 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5f13850fd9 RSP: 002b:00007ffe3b3518b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000200000000000 RCX: 00007f5f13850fd9 RDX: 0000200000000040 RSI: 00000000c020660b RDI: 0000000000000004 RBP: 6165627472616568 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe3b3518f0 R13: 00007ffe3b351b18 R14: 431bde82d7b634db R15: 00007f5f1389a03b ocfs2_fiemap() takes a read lock of the ip_alloc_sem semaphore (since v2.6.22-527-g7307de80510a) and calls fiemap_fill_next_extent() to read the extent list of this running mmap executable. The user supplied buffer to hold the fiemap information page faults calling ocfs2_page_mkwrite() which will take a write lock (since v2.6.27-38-g00dc417fa3e7) of the same semaphore. This recursive semaphore will hold filesystem locks and causes a hang of the fileystem. The ip_alloc_sem protects the inode extent list and size. Release the read semphore before calling fiemap_fill_next_extent() in ocfs2_fiemap() and ocfs2_fiemap_inline(). This does an unnecessary semaphore lock/unlock on the last extent but simplifies the error path. Link: https://lkml.kernel.org/r/61d1a62b-2631-4f12-81e2-cd689914360b@oracle.com Fixes: 00dc417fa3e7 ("ocfs2: fiemap support") Signed-off-by: Mark Tinguely <mark.tinguely@oracle.com> Reported-by: syzbot+541dcc6ee768f77103e7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=541dcc6ee768f77103e7 Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Changwei Ge <gechangwei@live.cn> Cc: Jun Piao <piaojun@huawei.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-08-29 10:18:15 -05:00
down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret < 0)
return ret;
}
return 0;
}
int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 map_start, u64 map_len)
{
int ret, is_last;
u32 mapping_end, cpos;
unsigned int hole_size;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u64 len_bytes, phys_bytes, virt_bytes;
struct buffer_head *di_bh = NULL;
struct ocfs2_extent_rec rec;
ret = fiemap_prep(inode, fieinfo, map_start, &map_len, 0);
if (ret)
return ret;
ret = ocfs2_inode_lock(inode, &di_bh, 0);
if (ret) {
mlog_errno(ret);
goto out;
}
down_read(&OCFS2_I(inode)->ip_alloc_sem);
/*
* Handle inline-data and fast symlink separately.
*/
if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
ocfs2_inode_is_fast_symlink(inode)) {
ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
goto out_unlock;
}
cpos = map_start >> osb->s_clustersize_bits;
mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
map_start + map_len);
is_last = 0;
while (cpos < mapping_end && !is_last) {
u32 fe_flags;
ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
&hole_size, &rec, &is_last);
if (ret) {
mlog_errno(ret);
goto out_unlock;
}
if (rec.e_blkno == 0ULL) {
cpos += hole_size;
continue;
}
fe_flags = 0;
if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
fe_flags |= FIEMAP_EXTENT_SHARED;
if (is_last)
fe_flags |= FIEMAP_EXTENT_LAST;
len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
ocfs2: fix recursive semaphore deadlock in fiemap call syzbot detected a OCFS2 hang due to a recursive semaphore on a FS_IOC_FIEMAP of the extent list on a specially crafted mmap file. context_switch kernel/sched/core.c:5357 [inline] __schedule+0x1798/0x4cc0 kernel/sched/core.c:6961 __schedule_loop kernel/sched/core.c:7043 [inline] schedule+0x165/0x360 kernel/sched/core.c:7058 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7115 rwsem_down_write_slowpath+0x872/0xfe0 kernel/locking/rwsem.c:1185 __down_write_common kernel/locking/rwsem.c:1317 [inline] __down_write kernel/locking/rwsem.c:1326 [inline] down_write+0x1ab/0x1f0 kernel/locking/rwsem.c:1591 ocfs2_page_mkwrite+0x2ff/0xc40 fs/ocfs2/mmap.c:142 do_page_mkwrite+0x14d/0x310 mm/memory.c:3361 wp_page_shared mm/memory.c:3762 [inline] do_wp_page+0x268d/0x5800 mm/memory.c:3981 handle_pte_fault mm/memory.c:6068 [inline] __handle_mm_fault+0x1033/0x5440 mm/memory.c:6195 handle_mm_fault+0x40a/0x8e0 mm/memory.c:6364 do_user_addr_fault+0x764/0x1390 arch/x86/mm/fault.c:1387 handle_page_fault arch/x86/mm/fault.c:1476 [inline] exc_page_fault+0x76/0xf0 arch/x86/mm/fault.c:1532 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623 RIP: 0010:copy_user_generic arch/x86/include/asm/uaccess_64.h:126 [inline] RIP: 0010:raw_copy_to_user arch/x86/include/asm/uaccess_64.h:147 [inline] RIP: 0010:_inline_copy_to_user include/linux/uaccess.h:197 [inline] RIP: 0010:_copy_to_user+0x85/0xb0 lib/usercopy.c:26 Code: e8 00 bc f7 fc 4d 39 fc 72 3d 4d 39 ec 77 38 e8 91 b9 f7 fc 4c 89 f7 89 de e8 47 25 5b fd 0f 01 cb 4c 89 ff 48 89 d9 4c 89 f6 <f3> a4 0f 1f 00 48 89 cb 0f 01 ca 48 89 d8 5b 41 5c 41 5d 41 5e 41 RSP: 0018:ffffc9000403f950 EFLAGS: 00050256 RAX: ffffffff84c7f101 RBX: 0000000000000038 RCX: 0000000000000038 RDX: 0000000000000000 RSI: ffffc9000403f9e0 RDI: 0000200000000060 RBP: ffffc9000403fa90 R08: ffffc9000403fa17 R09: 1ffff92000807f42 R10: dffffc0000000000 R11: fffff52000807f43 R12: 0000200000000098 R13: 00007ffffffff000 R14: ffffc9000403f9e0 R15: 0000200000000060 copy_to_user include/linux/uaccess.h:225 [inline] fiemap_fill_next_extent+0x1c0/0x390 fs/ioctl.c:145 ocfs2_fiemap+0x888/0xc90 fs/ocfs2/extent_map.c:806 ioctl_fiemap fs/ioctl.c:220 [inline] do_vfs_ioctl+0x1173/0x1430 fs/ioctl.c:532 __do_sys_ioctl fs/ioctl.c:596 [inline] __se_sys_ioctl+0x82/0x170 fs/ioctl.c:584 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5f13850fd9 RSP: 002b:00007ffe3b3518b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000200000000000 RCX: 00007f5f13850fd9 RDX: 0000200000000040 RSI: 00000000c020660b RDI: 0000000000000004 RBP: 6165627472616568 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe3b3518f0 R13: 00007ffe3b351b18 R14: 431bde82d7b634db R15: 00007f5f1389a03b ocfs2_fiemap() takes a read lock of the ip_alloc_sem semaphore (since v2.6.22-527-g7307de80510a) and calls fiemap_fill_next_extent() to read the extent list of this running mmap executable. The user supplied buffer to hold the fiemap information page faults calling ocfs2_page_mkwrite() which will take a write lock (since v2.6.27-38-g00dc417fa3e7) of the same semaphore. This recursive semaphore will hold filesystem locks and causes a hang of the fileystem. The ip_alloc_sem protects the inode extent list and size. Release the read semphore before calling fiemap_fill_next_extent() in ocfs2_fiemap() and ocfs2_fiemap_inline(). This does an unnecessary semaphore lock/unlock on the last extent but simplifies the error path. Link: https://lkml.kernel.org/r/61d1a62b-2631-4f12-81e2-cd689914360b@oracle.com Fixes: 00dc417fa3e7 ("ocfs2: fiemap support") Signed-off-by: Mark Tinguely <mark.tinguely@oracle.com> Reported-by: syzbot+541dcc6ee768f77103e7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=541dcc6ee768f77103e7 Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Changwei Ge <gechangwei@live.cn> Cc: Jun Piao <piaojun@huawei.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-08-29 10:18:15 -05:00
/* Release the ip_alloc_sem to prevent deadlock on page fault */
up_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
len_bytes, fe_flags);
ocfs2: fix recursive semaphore deadlock in fiemap call syzbot detected a OCFS2 hang due to a recursive semaphore on a FS_IOC_FIEMAP of the extent list on a specially crafted mmap file. context_switch kernel/sched/core.c:5357 [inline] __schedule+0x1798/0x4cc0 kernel/sched/core.c:6961 __schedule_loop kernel/sched/core.c:7043 [inline] schedule+0x165/0x360 kernel/sched/core.c:7058 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7115 rwsem_down_write_slowpath+0x872/0xfe0 kernel/locking/rwsem.c:1185 __down_write_common kernel/locking/rwsem.c:1317 [inline] __down_write kernel/locking/rwsem.c:1326 [inline] down_write+0x1ab/0x1f0 kernel/locking/rwsem.c:1591 ocfs2_page_mkwrite+0x2ff/0xc40 fs/ocfs2/mmap.c:142 do_page_mkwrite+0x14d/0x310 mm/memory.c:3361 wp_page_shared mm/memory.c:3762 [inline] do_wp_page+0x268d/0x5800 mm/memory.c:3981 handle_pte_fault mm/memory.c:6068 [inline] __handle_mm_fault+0x1033/0x5440 mm/memory.c:6195 handle_mm_fault+0x40a/0x8e0 mm/memory.c:6364 do_user_addr_fault+0x764/0x1390 arch/x86/mm/fault.c:1387 handle_page_fault arch/x86/mm/fault.c:1476 [inline] exc_page_fault+0x76/0xf0 arch/x86/mm/fault.c:1532 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623 RIP: 0010:copy_user_generic arch/x86/include/asm/uaccess_64.h:126 [inline] RIP: 0010:raw_copy_to_user arch/x86/include/asm/uaccess_64.h:147 [inline] RIP: 0010:_inline_copy_to_user include/linux/uaccess.h:197 [inline] RIP: 0010:_copy_to_user+0x85/0xb0 lib/usercopy.c:26 Code: e8 00 bc f7 fc 4d 39 fc 72 3d 4d 39 ec 77 38 e8 91 b9 f7 fc 4c 89 f7 89 de e8 47 25 5b fd 0f 01 cb 4c 89 ff 48 89 d9 4c 89 f6 <f3> a4 0f 1f 00 48 89 cb 0f 01 ca 48 89 d8 5b 41 5c 41 5d 41 5e 41 RSP: 0018:ffffc9000403f950 EFLAGS: 00050256 RAX: ffffffff84c7f101 RBX: 0000000000000038 RCX: 0000000000000038 RDX: 0000000000000000 RSI: ffffc9000403f9e0 RDI: 0000200000000060 RBP: ffffc9000403fa90 R08: ffffc9000403fa17 R09: 1ffff92000807f42 R10: dffffc0000000000 R11: fffff52000807f43 R12: 0000200000000098 R13: 00007ffffffff000 R14: ffffc9000403f9e0 R15: 0000200000000060 copy_to_user include/linux/uaccess.h:225 [inline] fiemap_fill_next_extent+0x1c0/0x390 fs/ioctl.c:145 ocfs2_fiemap+0x888/0xc90 fs/ocfs2/extent_map.c:806 ioctl_fiemap fs/ioctl.c:220 [inline] do_vfs_ioctl+0x1173/0x1430 fs/ioctl.c:532 __do_sys_ioctl fs/ioctl.c:596 [inline] __se_sys_ioctl+0x82/0x170 fs/ioctl.c:584 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5f13850fd9 RSP: 002b:00007ffe3b3518b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000200000000000 RCX: 00007f5f13850fd9 RDX: 0000200000000040 RSI: 00000000c020660b RDI: 0000000000000004 RBP: 6165627472616568 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe3b3518f0 R13: 00007ffe3b351b18 R14: 431bde82d7b634db R15: 00007f5f1389a03b ocfs2_fiemap() takes a read lock of the ip_alloc_sem semaphore (since v2.6.22-527-g7307de80510a) and calls fiemap_fill_next_extent() to read the extent list of this running mmap executable. The user supplied buffer to hold the fiemap information page faults calling ocfs2_page_mkwrite() which will take a write lock (since v2.6.27-38-g00dc417fa3e7) of the same semaphore. This recursive semaphore will hold filesystem locks and causes a hang of the fileystem. The ip_alloc_sem protects the inode extent list and size. Release the read semphore before calling fiemap_fill_next_extent() in ocfs2_fiemap() and ocfs2_fiemap_inline(). This does an unnecessary semaphore lock/unlock on the last extent but simplifies the error path. Link: https://lkml.kernel.org/r/61d1a62b-2631-4f12-81e2-cd689914360b@oracle.com Fixes: 00dc417fa3e7 ("ocfs2: fiemap support") Signed-off-by: Mark Tinguely <mark.tinguely@oracle.com> Reported-by: syzbot+541dcc6ee768f77103e7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=541dcc6ee768f77103e7 Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Changwei Ge <gechangwei@live.cn> Cc: Jun Piao <piaojun@huawei.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-08-29 10:18:15 -05:00
down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret)
break;
cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters);
}
if (ret > 0)
ret = 0;
out_unlock:
brelse(di_bh);
up_read(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_inode_unlock(inode, 0);
out:
return ret;
}
/* Is IO overwriting allocated blocks? */
int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh,
u64 map_start, u64 map_len)
{
int ret = 0, is_last;
u32 mapping_end, cpos;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_extent_rec rec;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
if (ocfs2_size_fits_inline_data(di_bh, map_start + map_len))
return ret;
else
return -EAGAIN;
}
cpos = map_start >> osb->s_clustersize_bits;
mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
map_start + map_len);
is_last = 0;
while (cpos < mapping_end && !is_last) {
ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
NULL, &rec, &is_last);
if (ret) {
mlog_errno(ret);
goto out;
}
if (rec.e_blkno == 0ULL)
break;
if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
break;
cpos = le32_to_cpu(rec.e_cpos) +
le16_to_cpu(rec.e_leaf_clusters);
}
if (cpos < mapping_end)
ret = -EAGAIN;
out:
return ret;
}
int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
{
struct inode *inode = file->f_mapping->host;
int ret;
unsigned int is_last = 0, is_data = 0;
u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
u32 cpos, cend, clen, hole_size;
u64 extoff, extlen;
struct buffer_head *di_bh = NULL;
struct ocfs2_extent_rec rec;
BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE);
ret = ocfs2_inode_lock(inode, &di_bh, 0);
if (ret) {
mlog_errno(ret);
goto out;
}
down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (*offset >= i_size_read(inode)) {
ret = -ENXIO;
goto out_unlock;
}
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
if (whence == SEEK_HOLE)
*offset = i_size_read(inode);
goto out_unlock;
}
clen = 0;
cpos = *offset >> cs_bits;
cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
while (cpos < cend && !is_last) {
ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
&rec, &is_last);
if (ret) {
mlog_errno(ret);
goto out_unlock;
}
extoff = cpos;
extoff <<= cs_bits;
if (rec.e_blkno == 0ULL) {
clen = hole_size;
is_data = 0;
} else {
clen = le16_to_cpu(rec.e_leaf_clusters) -
(cpos - le32_to_cpu(rec.e_cpos));
is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
}
if ((!is_data && whence == SEEK_HOLE) ||
(is_data && whence == SEEK_DATA)) {
if (extoff > *offset)
*offset = extoff;
goto out_unlock;
}
if (!is_last)
cpos += clen;
}
if (whence == SEEK_HOLE) {
extoff = cpos;
extoff <<= cs_bits;
extlen = clen;
extlen <<= cs_bits;
if ((extoff + extlen) > i_size_read(inode))
extlen = i_size_read(inode) - extoff;
extoff += extlen;
if (extoff > *offset)
*offset = extoff;
goto out_unlock;
}
ret = -ENXIO;
out_unlock:
brelse(di_bh);
up_read(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_inode_unlock(inode, 0);
out:
return ret;
}
int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
struct buffer_head *bh))
{
int rc = 0;
u64 p_block, p_count;
int i, count, done = 0;
trace_ocfs2_read_virt_blocks(
inode, (unsigned long long)v_block, nr, bhs, flags,
validate);
if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
i_size_read(inode)) {
BUG_ON(!(flags & OCFS2_BH_READAHEAD));
goto out;
}
while (done < nr) {
if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
rc = -EAGAIN;
mlog(ML_ERROR,
"Inode #%llu ip_alloc_sem is temporarily unavailable\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
break;
}
rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
&p_block, &p_count, NULL);
up_read(&OCFS2_I(inode)->ip_alloc_sem);
if (rc) {
mlog_errno(rc);
break;
}
if (!p_block) {
rc = -EIO;
mlog(ML_ERROR,
"Inode #%llu contains a hole at offset %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)(v_block + done) <<
inode->i_sb->s_blocksize_bits);
break;
}
count = nr - done;
if (p_count < count)
count = p_count;
/*
* If the caller passed us bhs, they should have come
* from a previous readahead call to this function. Thus,
* they should have the right b_blocknr.
*/
for (i = 0; i < count; i++) {
if (!bhs[done + i])
continue;
BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
}
rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count,
bhs + done, flags, validate);
if (rc) {
mlog_errno(rc);
break;
}
done += count;
}
out:
return rc;
}