1/* extent-scan.c -- core functions for scanning extents
2   Copyright (C) 2010-2015 Free Software Foundation, Inc.
3
4   This program is free software: you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation, either version 3 of the License, or
7   (at your option) any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17   Written by Jie Liu (jeff.liu@oracle.com).  */
18
19#include <config.h>
20#include <sys/types.h>
21#include <sys/ioctl.h>
22#include <sys/utsname.h>
23#include <assert.h>
24
25#include "system.h"
26#include "extent-scan.h"
27#include "fiemap.h"
28#include "xstrtol.h"
29
30
31/* Work around Linux kernel issues on BTRFS and EXT4.  */
32static bool
33extent_need_sync (void)
34{
35  /* For now always return true, to be on the safe side.
36     If/when FIEMAP semantics are well defined (before SEEK_HOLE support
37     is usable) and kernels implementing them are in use, we may relax
38     this once again.  */
39  return true;
40
41#if FIEMAP_BEHAVIOR_IS_DEFINED_AND_USABLE
42  static int need_sync = -1;
43
44  if (need_sync == -1)
45    {
46      struct utsname name;
47      need_sync = 0; /* No workaround by default.  */
48
49# ifdef __linux__
50      if (uname (&name) != -1 && STRNCMP_LIT (name.release, "2.6.") == 0)
51        {
52           unsigned long val;
53           if (xstrtoul (name.release + 4, NULL, 10, &val, NULL) == LONGINT_OK)
54             {
55               if (val < 39)
56                 need_sync = 1;
57             }
58        }
59# endif
60    }
61
62  return need_sync;
63#endif
64}
65
66/* Allocate space for struct extent_scan, initialize the entries if
67   necessary and return it as the input argument of extent_scan_read().  */
68extern void
69extent_scan_init (int src_fd, struct extent_scan *scan)
70{
71  scan->fd = src_fd;
72  scan->ei_count = 0;
73  scan->ext_info = NULL;
74  scan->scan_start = 0;
75  scan->initial_scan_failed = false;
76  scan->hit_final_extent = false;
77  scan->fm_flags = extent_need_sync () ? FIEMAP_FLAG_SYNC : 0;
78}
79
80#ifdef __linux__
81# ifndef FS_IOC_FIEMAP
82#  define FS_IOC_FIEMAP _IOWR ('f', 11, struct fiemap)
83# endif
84/* Call ioctl(2) with FS_IOC_FIEMAP (available in linux 2.6.27) to
85   obtain a map of file extents excluding holes.  */
86extern bool
87extent_scan_read (struct extent_scan *scan)
88{
89  unsigned int si = 0;
90  struct extent_info *last_ei = scan->ext_info;
91
92  while (true)
93    {
94      union { struct fiemap f; char c[4096]; } fiemap_buf;
95      struct fiemap *fiemap = &fiemap_buf.f;
96      struct fiemap_extent *fm_extents = &fiemap->fm_extents[0];
97      enum { count = (sizeof fiemap_buf - sizeof *fiemap)/sizeof *fm_extents };
98      verify (count > 1);
99
100      /* This is required at least to initialize fiemap->fm_start,
101         but also serves (in mid 2010) to appease valgrind, which
102         appears not to know the semantics of the FIEMAP ioctl. */
103      memset (&fiemap_buf, 0, sizeof fiemap_buf);
104
105      fiemap->fm_start = scan->scan_start;
106      fiemap->fm_flags = scan->fm_flags;
107      fiemap->fm_extent_count = count;
108      fiemap->fm_length = FIEMAP_MAX_OFFSET - scan->scan_start;
109
110      /* Fall back to the standard copy if call ioctl(2) failed for
111         the first time.  */
112      if (ioctl (scan->fd, FS_IOC_FIEMAP, fiemap) < 0)
113        {
114          if (scan->scan_start == 0)
115            scan->initial_scan_failed = true;
116          return false;
117        }
118
119      /* If 0 extents are returned, then no more scans are needed.  */
120      if (fiemap->fm_mapped_extents == 0)
121        {
122          scan->hit_final_extent = true;
123          return scan->scan_start != 0;
124        }
125
126      assert (scan->ei_count <= SIZE_MAX - fiemap->fm_mapped_extents);
127      scan->ei_count += fiemap->fm_mapped_extents;
128      {
129        /* last_ei points into a buffer that may be freed via xnrealloc.
130           Record its offset and adjust after allocation.  */
131        size_t prev_idx = last_ei - scan->ext_info;
132        scan->ext_info = xnrealloc (scan->ext_info, scan->ei_count,
133                                    sizeof (struct extent_info));
134        last_ei = scan->ext_info + prev_idx;
135      }
136
137      unsigned int i = 0;
138      for (i = 0; i < fiemap->fm_mapped_extents; i++)
139        {
140          assert (fm_extents[i].fe_logical
141                  <= OFF_T_MAX - fm_extents[i].fe_length);
142
143          verify (sizeof last_ei->ext_flags >= sizeof fm_extents->fe_flags);
144
145          if (si && last_ei->ext_flags
146              == (fm_extents[i].fe_flags & ~FIEMAP_EXTENT_LAST)
147              && (last_ei->ext_logical + last_ei->ext_length
148                  == fm_extents[i].fe_logical))
149            {
150              /* Merge previous with last.  */
151              last_ei->ext_length += fm_extents[i].fe_length;
152              /* Copy flags in case different.  */
153              last_ei->ext_flags = fm_extents[i].fe_flags;
154            }
155          else if ((si == 0 && scan->scan_start > fm_extents[i].fe_logical)
156                   || (si && (last_ei->ext_logical + last_ei->ext_length
157                              > fm_extents[i].fe_logical)))
158            {
159              /* BTRFS before 2.6.38 could return overlapping extents
160                 for sparse files.  We adjust the returned extents
161                 rather than failing, as otherwise it would be inefficient
162                 to detect this on the initial scan.  */
163              uint64_t new_logical;
164              uint64_t length_adjust;
165              if (si == 0)
166                new_logical = scan->scan_start;
167              else
168                {
169                  /* We could return here if scan->scan_start == 0
170                     but don't so as to minimize special cases.  */
171                  new_logical = last_ei->ext_logical + last_ei->ext_length;
172                }
173              length_adjust = new_logical - fm_extents[i].fe_logical;
174              /* If an extent is contained within the previous one, fail.  */
175              if (length_adjust < fm_extents[i].fe_length)
176                {
177                  if (scan->scan_start == 0)
178                    scan->initial_scan_failed = true;
179                  return false;
180                }
181              fm_extents[i].fe_logical = new_logical;
182              fm_extents[i].fe_length -= length_adjust;
183              /* Process the adjusted extent again.  */
184              i--;
185              continue;
186            }
187          else
188            {
189              last_ei = scan->ext_info + si;
190              last_ei->ext_logical = fm_extents[i].fe_logical;
191              last_ei->ext_length = fm_extents[i].fe_length;
192              last_ei->ext_flags = fm_extents[i].fe_flags;
193              si++;
194            }
195        }
196
197      if (last_ei->ext_flags & FIEMAP_EXTENT_LAST)
198        scan->hit_final_extent = true;
199
200      /* If we have enough extents, discard the last as it might
201         be merged with one from the next scan.  */
202      if (si > count && !scan->hit_final_extent)
203        last_ei = scan->ext_info + --si - 1;
204
205      /* We don't bother reallocating any trailing slots.  */
206      scan->ei_count = si;
207
208      if (scan->hit_final_extent)
209        break;
210      else
211        scan->scan_start = last_ei->ext_logical + last_ei->ext_length;
212
213      if (si >= count)
214        break;
215    }
216
217  return true;
218}
219#else
220extern bool
221extent_scan_read (struct extent_scan *scan _GL_UNUSED)
222{
223  scan->initial_scan_failed = true;
224  errno = ENOTSUP;
225  return false;
226}
227#endif
228