add preliminary 2.6.32 support
[openwrt.git] / target / linux / generic-2.6 / patches-2.6.32 / 230-union_mounts.patch
1 --- /dev/null
2 +++ b/Documentation/filesystems/union-mounts.txt
3 @@ -0,0 +1,187 @@
4 +VFS based Union Mounts
5 +----------------------
6 +
7 + 1. What are "Union Mounts"
8 + 2. The Union Stack
9 + 3. Whiteouts, Opaque Directories, and Fallthrus
10 + 4. Copy-up
11 + 5. Directory Reading
12 + 6. Known Problems
13 + 7. References
14 +
15 +-------------------------------------------------------------------------------
16 +
17 +1. What are "Union Mounts"
18 +==========================
19 +
20 +Please note: this is NOT about UnionFS and it is NOT derived work!
21 +
22 +Traditionally the mount operation is opaque, which means that the content of
23 +the mount point, the directory where the file system is mounted on, is hidden
24 +by the content of the mounted file system's root directory until the file
25 +system is unmounted again. Unlike the traditional UNIX mount mechanism, which
26 +hides the contents of the mount point, a union mount presents a view as if
27 +both filesystems are merged together. Although only the topmost layer of the
28 +mount stack can be altered, it appears as if transparent file system mounts
29 +allow any file to be created, modified or deleted.
30 +
31 +Most people know the concepts and features of union mounts from other
32 +operating systems like Sun's Translucent Filesystem, Plan9 or BSD. For an
33 +in-depth review of union mounts and other unioning file systems, see:
34 +
35 +http://lwn.net/Articles/324291/
36 +http://lwn.net/Articles/325369/
37 +http://lwn.net/Articles/327738/
38 +
39 +Here are the key features of this implementation:
40 +- completely VFS based
41 +- does not change the namespace stacking
42 +- directory listings have duplicate entries removed in the kernel
43 +- writable unions: only the topmost file system layer may be writable
44 +- writable unions: new whiteout filetype handled inside the kernel
45 +
46 +-------------------------------------------------------------------------------
47 +
48 +2. The Union Stack
49 +==================
50 +
51 +The mounted file systems are organized in the "file system hierarchy" (tree of
52 +vfsmount structures), which keeps track of the stacking of file systems
53 +upon each other. The per-directory view on the file system hierarchy is called
54 +"mount stack" and reflects the order of file systems, which are mounted on a
55 +specific directory.
56 +
57 +Union mounts present a single unified view of the contents of two or more file
58 +systems as if they are merged together. Since the information which file
59 +system objects are part of a unified view is not directly available from the
60 +file system hierarchy there is a need for a new structure. The file system
61 +objects, which are part of a unified view are ordered in a so-called "union
62 +stack". Only directories can be part of a unified view.
63 +
64 +The link between two layers of the union stack is maintained using the
65 +union_mount structure (#include <linux/union.h>):
66 +
67 +struct union_mount {
68 + atomic_t u_count; /* reference count */
69 + struct mutex u_mutex;
70 + struct list_head u_unions; /* list head for d_unions */
71 + struct hlist_node u_hash; /* list head for searching */
72 + struct hlist_node u_rhash; /* list head for reverse searching */
73 +
74 + struct path u_this; /* this is me */
75 + struct path u_next; /* this is what I overlay */
76 +};
77 +
78 +The union_mount structure holds a reference (dget,mntget) to the next lower
79 +layer of the union stack. Since a dentry can be part of multiple unions
80 +(e.g. with bind mounts) they are tied together via the d_unions field of the
81 +dentry structure.
82 +
83 +All union_mount structures are cached in two hash tables, one for lookups of
84 +the next lower layer of the union stack and one for reverse lookups of the
85 +next upper layer of the union stack. The reverse lookup is necessary to
86 +resolve CWD relative path lookups. For calculation of the hash value, the
87 +(dentry,vfsmount) pair is used. The u_this field is used for the hash table
88 +which is used in forward lookups and the u_next field for the reverse lookups.
89 +
90 +During every new mount (or mount propagation), a new union_mount structure is
91 +allocated. A reference to the mountpoint's vfsmount and dentry is taken and
92 +stored in the u_next field. In almost the same manner a union_mount
93 +structure is created during the first time lookup of a directory within a
94 +union mount point. In this case the lookup proceeds to all lower layers of the
95 +union. Therefore the complete union stack is constructed during lookups.
96 +
97 +The union_mount structures of a dentry are destroyed when the dentry itself is
98 +destroyed. Therefore the dentry cache indirectly drives the union_mount
99 +cache, just as is done for inodes. Please note that lower layer
100 +union_mount structures are kept in memory until the topmost dentry is
101 +destroyed.
102 +
103 +-------------------------------------------------------------------------------
104 +
105 +3. Whiteouts, Opaque Directories, and Fallthrus
106 +===============================================
107 +
108 +The whiteout filetype isn't new. It has been there for quite some time now
109 +but Linux's VFS hasn't used it yet. With the availability of union mount code
110 +inside the VFS the whiteout filetype is getting important to support writable
111 +union mounts. For read-only union mounts, support for whiteouts or
112 +copy-on-open is not necessary.
113 +
114 +The whiteout filetype has the same function as negative dentries: they
115 +describe a filename which isn't there. The creation of whiteouts needs
116 +lowlevel filesystem support. At the time of writing this, there is whiteout
117 +support for tmpfs, ext2 and ext3 available. The VFS is extended to make the
118 +whiteout handling transparent to all its users. The whiteouts are not
119 +visible to user-space.
120 +
121 +What happens when we create a directory that was previously whited-out? We
122 +don't want the directory entries from underlying filesystems to suddenly appear
123 +in the newly created directory. So we mark the directory opaque (the file
124 +system must support storage of the opaque flag).
125 +
126 +Fallthrus are directory entries that override the opaque flag on a directory
127 +for that specific directory entry name (the lookup "falls through" to the next
128 +layer of the union mount). Fallthrus are mainly useful for implementing
129 +readdir().
130 +
131 +-------------------------------------------------------------------------------
132 +
133 +4. Copy-up
134 +==========
135 +
136 +Any write to an object on any layer other than the topmost triggers a copy-up
137 +of the object to the topmost file system. For regular files, the copy-up
138 +happens when it is opened in writable mode.
139 +
140 +Directories are copied up on open, regardless of intent to write, to simplify
141 +copy-up of any object located below it in the namespace. Otherwise we have to
142 +walk the entire pathname to create intermediate directories whenever we do a
143 +copy-up. This is the same approach as BSD union mounts and uses a negligible
144 +amount of disk space. Note that the actual directory entries themselves are
145 +not copied-up from the lower levels until (a) the directory is written to, or
146 +(b) the first readdir() of the directory (more on that later).
147 +
148 +Rename across different levels of the union is implemented as a copy-up
149 +operation for regular files. Rename of directories simply returns EXDEV, the
150 +same as if we tried to rename across different mounts. Most applications have
151 +to handle this case anyway. Some applications do not expect EXDEV on
152 +rename operations within the same directory, but these applications will also
153 +be broken with bind mounts.
154 +
155 +-------------------------------------------------------------------------------
156 +
157 +5. Directory Reading
158 +====================
159 +
160 +readdir() is somewhat difficult to implement in a unioning file system. We must
161 +eliminate duplicates, apply whiteouts, and start up readdir() where we left
162 +off, given a single f_pos value. Our solution is to copy up all the directory
163 +entries to the topmost directory the first time readdir() is called on a
164 +directory. During this copy-up, we skip duplicates and entries covered by
165 +whiteouts, and then create fallthru entries for each remaining visible dentry.
166 +Then we mark the whole directory opaque. From then on, we just use the topmost
167 +file system's normal readdir() operation.
168 +
169 +-------------------------------------------------------------------------------
170 +
171 +6. Known Problems
172 +=================
173 +
174 +- copyup() for other filetypes than reg and dir (e.g. for chown() on devices)
175 +- symlinks are untested
176 +
177 +-------------------------------------------------------------------------------
178 +
179 +7. References
180 +=============
181 +
182 +[1] http://marc.info/?l=linux-fsdevel&m=96035682927821&w=2
183 +[2] http://marc.info/?l=linux-fsdevel&m=117681527820133&w=2
184 +[3] http://marc.info/?l=linux-fsdevel&m=117913503200362&w=2
185 +[4] http://marc.info/?l=linux-fsdevel&m=118231827024394&w=2
186 +
187 +Authors:
188 +Jan Blunck <jblunck@suse.de>
189 +Bharata B Rao <bharata@linux.vnet.ibm.com>
190 +Valerie Aurora <vaurora@redhat.com>
191 --- a/fs/autofs4/autofs_i.h
192 +++ b/fs/autofs4/autofs_i.h
193 @@ -130,6 +130,7 @@ struct autofs_sb_info {
194 int reghost_enabled;
195 int needs_reghost;
196 struct super_block *sb;
197 + struct vfsmount *mnt;
198 struct mutex wq_mutex;
199 spinlock_t fs_lock;
200 struct autofs_wait_queue *queues; /* Wait queue pointer */
201 --- a/fs/autofs4/init.c
202 +++ b/fs/autofs4/init.c
203 @@ -17,7 +17,16 @@
204 static int autofs_get_sb(struct file_system_type *fs_type,
205 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
206 {
207 - return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
208 + struct autofs_sb_info *sbi;
209 + int ret;
210 +
211 + ret = get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
212 + if (ret)
213 + return ret;
214 +
215 + sbi = autofs4_sbi(mnt->mnt_sb);
216 + sbi->mnt = mnt;
217 + return 0;
218 }
219
220 static struct file_system_type autofs_fs_type = {
221 --- a/fs/autofs4/root.c
222 +++ b/fs/autofs4/root.c
223 @@ -179,6 +179,12 @@ static void *autofs4_follow_link(struct
224 DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
225 dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
226 nd->flags);
227 +
228 + dput(nd->path.dentry);
229 + mntput(nd->path.mnt);
230 + nd->path.mnt = mntget(sbi->mnt);
231 + nd->path.dentry = dget(dentry);
232 +
233 /*
234 * For an expire of a covered direct or offset mount we need
235 * to break out of follow_down() at the autofs mount trigger
236 --- a/fs/compat.c
237 +++ b/fs/compat.c
238 @@ -840,6 +840,9 @@ static int compat_fillonedir(void *__buf
239 struct compat_old_linux_dirent __user *dirent;
240 compat_ulong_t d_ino;
241
242 + if (d_type == DT_WHT)
243 + return 0;
244 +
245 if (buf->result)
246 return -EINVAL;
247 d_ino = ino;
248 @@ -911,6 +914,9 @@ static int compat_filldir(void *__buf, c
249 compat_ulong_t d_ino;
250 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(compat_long_t));
251
252 + if (d_type == DT_WHT)
253 + return 0;
254 +
255 buf->error = -EINVAL; /* only used if we fail.. */
256 if (reclen > buf->count)
257 return -EINVAL;
258 @@ -1000,6 +1006,9 @@ static int compat_filldir64(void * __buf
259 int reclen = ALIGN(jj + namlen + 1, sizeof(u64));
260 u64 off;
261
262 + if (d_type == DT_WHT)
263 + return 0;
264 +
265 buf->error = -EINVAL; /* only used if we fail.. */
266 if (reclen > buf->count)
267 return -EINVAL;
268 --- a/fs/dcache.c
269 +++ b/fs/dcache.c
270 @@ -18,6 +18,7 @@
271 #include <linux/string.h>
272 #include <linux/mm.h>
273 #include <linux/fs.h>
274 +#include <linux/union.h>
275 #include <linux/fsnotify.h>
276 #include <linux/slab.h>
277 #include <linux/init.h>
278 @@ -158,14 +159,19 @@ static void dentry_lru_del_init(struct d
279 }
280
281 /**
282 - * d_kill - kill dentry and return parent
283 + * __d_kill - kill dentry and return parent
284 * @dentry: dentry to kill
285 + * @list: kill list
286 + * @greedy: return parent instead of putting it on the kill list
287 *
288 * The dentry must already be unhashed and removed from the LRU.
289 *
290 - * If this is the root of the dentry tree, return NULL.
291 + * If this is the root of the dentry tree, return NULL. If greedy is zero, we
292 + * put the parent of this dentry on the kill list instead. The callers must
293 + * make sure that __d_kill_final() is called on all dentries on the kill list.
294 */
295 -static struct dentry *d_kill(struct dentry *dentry)
296 +static struct dentry *__d_kill(struct dentry *dentry, struct list_head *list,
297 + int greedy)
298 __releases(dentry->d_lock)
299 __releases(dcache_lock)
300 {
301 @@ -173,13 +179,78 @@ static struct dentry *d_kill(struct dent
302
303 list_del(&dentry->d_u.d_child);
304 dentry_stat.nr_dentry--; /* For d_free, below */
305 - /*drops the locks, at that point nobody can reach this dentry */
306 +
307 + /*
308 + * If we are not greedy we just put this on a list for later processing
309 + * (follow up to parent, releasing of inode and freeing dentry memory).
310 + */
311 + if (!greedy) {
312 + list_del_init(&dentry->d_alias);
313 + /* at this point nobody can reach this dentry */
314 + list_add(&dentry->d_lru, list);
315 + spin_unlock(&dentry->d_lock);
316 + spin_unlock(&dcache_lock);
317 + __shrink_d_unions(dentry, list);
318 + return NULL;
319 + }
320 +
321 + /* drops the locks, at that point nobody can reach this dentry */
322 dentry_iput(dentry);
323 + /* If the dentry was in an union delete them */
324 + __shrink_d_unions(dentry, list);
325 + if (IS_ROOT(dentry))
326 + parent = NULL;
327 + else
328 + parent = dentry->d_parent;
329 + d_free(dentry);
330 + return parent;
331 +}
332 +
333 +void __dput(struct dentry *, struct list_head *, int);
334 +
335 +static void __d_kill_final(struct dentry *dentry, struct list_head *list)
336 +{
337 + struct dentry *parent;
338 + struct inode *inode = dentry->d_inode;
339 +
340 + if (inode) {
341 + dentry->d_inode = NULL;
342 + if (!inode->i_nlink)
343 + fsnotify_inoderemove(inode);
344 + if (dentry->d_op && dentry->d_op->d_iput)
345 + dentry->d_op->d_iput(dentry, inode);
346 + else
347 + iput(inode);
348 + }
349 +
350 if (IS_ROOT(dentry))
351 parent = NULL;
352 else
353 parent = dentry->d_parent;
354 d_free(dentry);
355 + __dput(parent, list, 1);
356 +}
357 +
358 +/**
359 + * d_kill - kill dentry and return parent
360 + * @dentry: dentry to kill
361 + *
362 + * The dentry must already be unhashed and removed from the LRU.
363 + *
364 + * If this is the root of the dentry tree, return NULL.
365 + */
366 +static struct dentry *d_kill(struct dentry *dentry)
367 +{
368 + LIST_HEAD(mortuary);
369 + struct dentry *parent;
370 +
371 + parent = __d_kill(dentry, &mortuary, 1);
372 + while (!list_empty(&mortuary)) {
373 + dentry = list_entry(mortuary.next, struct dentry, d_lru);
374 + list_del(&dentry->d_lru);
375 + __d_kill_final(dentry, &mortuary);
376 + }
377 +
378 return parent;
379 }
380
381 @@ -200,19 +271,24 @@ static struct dentry *d_kill(struct dent
382 * Real recursion would eat up our stack space.
383 */
384
385 -/*
386 - * dput - release a dentry
387 - * @dentry: dentry to release
388 +/**
389 + * __dput - release a dentry
390 + * @dentry: dentry to release
391 + * @list: kill list argument for __d_kill()
392 + * @greedy: greedy argument for __d_kill()
393 *
394 * Release a dentry. This will drop the usage count and if appropriate
395 * call the dentry unlink method as well as removing it from the queues and
396 * releasing its resources. If the parent dentries were scheduled for release
397 - * they too may now get deleted.
398 + * they too may now get deleted if @greedy is not zero. Otherwise parent is
399 + * added to the kill list. The callers must make sure that __d_kill_final() is
400 + * called on all dentries on the kill list.
401 + *
402 + * You probably want to use dput() instead.
403 *
404 * no dcache lock, please.
405 */
406 -
407 -void dput(struct dentry *dentry)
408 +void __dput(struct dentry *dentry, struct list_head *list, int greedy)
409 {
410 if (!dentry)
411 return;
412 @@ -253,12 +329,35 @@ unhash_it:
413 kill_it:
414 /* if dentry was on the d_lru list delete it from there */
415 dentry_lru_del(dentry);
416 - dentry = d_kill(dentry);
417 + dentry = __d_kill(dentry, list, greedy);
418 if (dentry)
419 goto repeat;
420 }
421
422 /**
423 + * dput - release a dentry
424 + * @dentry: dentry to release
425 + *
426 + * Release a dentry. This will drop the usage count and if appropriate
427 + * call the dentry unlink method as well as removing it from the queues and
428 + * releasing its resources. If the parent dentries were scheduled for release
429 + * they too may now get deleted.
430 + *
431 + * no dcache lock, please.
432 + */
433 +void dput(struct dentry *dentry)
434 +{
435 + LIST_HEAD(mortuary);
436 +
437 + __dput(dentry, &mortuary, 1);
438 + while (!list_empty(&mortuary)) {
439 + dentry = list_entry(mortuary.next, struct dentry, d_lru);
440 + list_del(&dentry->d_lru);
441 + __d_kill_final(dentry, &mortuary);
442 + }
443 +}
444 +
445 +/**
446 * d_invalidate - invalidate a dentry
447 * @dentry: dentry to invalidate
448 *
449 @@ -690,6 +789,7 @@ static void shrink_dcache_for_umount_sub
450 iput(inode);
451 }
452
453 + shrink_d_unions(dentry);
454 d_free(dentry);
455
456 /* finished when we fall off the top of the tree,
457 @@ -952,6 +1052,10 @@ struct dentry *d_alloc(struct dentry * p
458 INIT_LIST_HEAD(&dentry->d_lru);
459 INIT_LIST_HEAD(&dentry->d_subdirs);
460 INIT_LIST_HEAD(&dentry->d_alias);
461 +#ifdef CONFIG_UNION_MOUNT
462 + INIT_LIST_HEAD(&dentry->d_unions);
463 + dentry->d_unionized = 0;
464 +#endif
465
466 if (parent) {
467 dentry->d_parent = dget(parent);
468 @@ -982,8 +1086,10 @@ struct dentry *d_alloc_name(struct dentr
469 /* the caller must hold dcache_lock */
470 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
471 {
472 - if (inode)
473 + if (inode) {
474 + dentry->d_flags &= ~(DCACHE_WHITEOUT|DCACHE_FALLTHRU);
475 list_add(&dentry->d_alias, &inode->i_dentry);
476 + }
477 dentry->d_inode = inode;
478 fsnotify_d_instantiate(dentry, inode);
479 }
480 @@ -1514,7 +1620,9 @@ void d_delete(struct dentry * dentry)
481 spin_lock(&dentry->d_lock);
482 isdir = S_ISDIR(dentry->d_inode->i_mode);
483 if (atomic_read(&dentry->d_count) == 1) {
484 + __d_drop_unions(dentry);
485 dentry_iput(dentry);
486 + shrink_d_unions(dentry);
487 fsnotify_nameremove(dentry, isdir);
488 return;
489 }
490 @@ -1525,14 +1633,14 @@ void d_delete(struct dentry * dentry)
491 spin_unlock(&dentry->d_lock);
492 spin_unlock(&dcache_lock);
493
494 + shrink_d_unions(dentry);
495 fsnotify_nameremove(dentry, isdir);
496 }
497
498 static void __d_rehash(struct dentry * entry, struct hlist_head *list)
499 {
500 -
501 - entry->d_flags &= ~DCACHE_UNHASHED;
502 - hlist_add_head_rcu(&entry->d_hash, list);
503 + entry->d_flags &= ~DCACHE_UNHASHED;
504 + hlist_add_head_rcu(&entry->d_hash, list);
505 }
506
507 static void _d_rehash(struct dentry * entry)
508 @@ -1551,6 +1659,7 @@ void d_rehash(struct dentry * entry)
509 {
510 spin_lock(&dcache_lock);
511 spin_lock(&entry->d_lock);
512 + BUG_ON(!d_unhashed(entry));
513 _d_rehash(entry);
514 spin_unlock(&entry->d_lock);
515 spin_unlock(&dcache_lock);
516 @@ -2183,7 +2292,9 @@ resume:
517 struct list_head *tmp = next;
518 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
519 next = tmp->next;
520 - if (d_unhashed(dentry)||!dentry->d_inode)
521 + if (d_unhashed(dentry)||(!dentry->d_inode &&
522 + !d_is_whiteout(dentry) &&
523 + !d_is_fallthru(dentry)))
524 continue;
525 if (!list_empty(&dentry->d_subdirs)) {
526 this_parent = dentry;
527 --- a/fs/ext2/dir.c
528 +++ b/fs/ext2/dir.c
529 @@ -219,7 +219,8 @@ static inline int ext2_match (int len, c
530 {
531 if (len != de->name_len)
532 return 0;
533 - if (!de->inode)
534 + if (!de->inode && ((de->file_type != EXT2_FT_WHT) &&
535 + (de->file_type != EXT2_FT_FALLTHRU)))
536 return 0;
537 return !memcmp(name, de->name, len);
538 }
539 @@ -255,6 +256,8 @@ static unsigned char ext2_filetype_table
540 [EXT2_FT_FIFO] = DT_FIFO,
541 [EXT2_FT_SOCK] = DT_SOCK,
542 [EXT2_FT_SYMLINK] = DT_LNK,
543 + [EXT2_FT_WHT] = DT_WHT,
544 + [EXT2_FT_FALLTHRU] = DT_UNKNOWN,
545 };
546
547 #define S_SHIFT 12
548 @@ -341,6 +344,18 @@ ext2_readdir (struct file * filp, void *
549 ext2_put_page(page);
550 return 0;
551 }
552 + } else if (de->file_type == EXT2_FT_FALLTHRU) {
553 + int over;
554 + unsigned char d_type = DT_UNKNOWN;
555 +
556 + offset = (char *)de - kaddr;
557 + over = filldir(dirent, de->name, de->name_len,
558 + (n<<PAGE_CACHE_SHIFT) | offset,
559 + 123, d_type);
560 + if (over) {
561 + ext2_put_page(page);
562 + return 0;
563 + }
564 }
565 filp->f_pos += ext2_rec_len_from_disk(de->rec_len);
566 }
567 @@ -448,6 +463,30 @@ ino_t ext2_inode_by_name(struct inode *d
568 return res;
569 }
570
571 +/* Special version for filetype based whiteout support */
572 +ino_t ext2_inode_by_dentry(struct inode *dir, struct dentry *dentry)
573 +{
574 + ino_t res = 0;
575 + struct ext2_dir_entry_2 *de;
576 + struct page *page;
577 +
578 + de = ext2_find_entry (dir, &dentry->d_name, &page);
579 + if (de) {
580 + res = le32_to_cpu(de->inode);
581 + if (!res && de->file_type == EXT2_FT_WHT) {
582 + spin_lock(&dentry->d_lock);
583 + dentry->d_flags |= DCACHE_WHITEOUT;
584 + spin_unlock(&dentry->d_lock);
585 + } else if(!res && de->file_type == EXT2_FT_FALLTHRU) {
586 + spin_lock(&dentry->d_lock);
587 + dentry->d_flags |= DCACHE_FALLTHRU;
588 + spin_unlock(&dentry->d_lock);
589 + }
590 + ext2_put_page(page);
591 + }
592 + return res;
593 +}
594 +
595 /* Releases the page */
596 void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
597 struct page *page, struct inode *inode, int update_times)
598 @@ -472,9 +511,10 @@ void ext2_set_link(struct inode *dir, st
599 }
600
601 /*
602 - * Parent is locked.
603 + * Find or append a given dentry to the parent directory
604 */
605 -int ext2_add_link (struct dentry *dentry, struct inode *inode)
606 +static ext2_dirent * ext2_append_entry(struct dentry * dentry,
607 + struct page ** page)
608 {
609 struct inode *dir = dentry->d_parent->d_inode;
610 const char *name = dentry->d_name.name;
611 @@ -482,13 +522,10 @@ int ext2_add_link (struct dentry *dentry
612 unsigned chunk_size = ext2_chunk_size(dir);
613 unsigned reclen = EXT2_DIR_REC_LEN(namelen);
614 unsigned short rec_len, name_len;
615 - struct page *page = NULL;
616 - ext2_dirent * de;
617 + ext2_dirent * de = NULL;
618 unsigned long npages = dir_pages(dir);
619 unsigned long n;
620 char *kaddr;
621 - loff_t pos;
622 - int err;
623
624 /*
625 * We take care of directory expansion in the same loop.
626 @@ -498,55 +535,97 @@ int ext2_add_link (struct dentry *dentry
627 for (n = 0; n <= npages; n++) {
628 char *dir_end;
629
630 - page = ext2_get_page(dir, n, 0);
631 - err = PTR_ERR(page);
632 - if (IS_ERR(page))
633 + *page = ext2_get_page(dir, n, 0);
634 + de = ERR_PTR(PTR_ERR(*page));
635 + if (IS_ERR(*page))
636 goto out;
637 - lock_page(page);
638 - kaddr = page_address(page);
639 + lock_page(*page);
640 + kaddr = page_address(*page);
641 dir_end = kaddr + ext2_last_byte(dir, n);
642 de = (ext2_dirent *)kaddr;
643 kaddr += PAGE_CACHE_SIZE - reclen;
644 while ((char *)de <= kaddr) {
645 if ((char *)de == dir_end) {
646 /* We hit i_size */
647 - name_len = 0;
648 - rec_len = chunk_size;
649 + de->name_len = 0;
650 de->rec_len = ext2_rec_len_to_disk(chunk_size);
651 de->inode = 0;
652 + de->file_type = 0;
653 goto got_it;
654 }
655 if (de->rec_len == 0) {
656 ext2_error(dir->i_sb, __func__,
657 "zero-length directory entry");
658 - err = -EIO;
659 + de = ERR_PTR(-EIO);
660 goto out_unlock;
661 }
662 - err = -EEXIST;
663 if (ext2_match (namelen, name, de))
664 - goto out_unlock;
665 + goto got_it;
666 name_len = EXT2_DIR_REC_LEN(de->name_len);
667 rec_len = ext2_rec_len_from_disk(de->rec_len);
668 - if (!de->inode && rec_len >= reclen)
669 + if (!de->inode && (de->file_type != EXT2_FT_WHT) &&
670 + (de->file_type != EXT2_FT_FALLTHRU) &&
671 + (rec_len >= reclen))
672 goto got_it;
673 if (rec_len >= name_len + reclen)
674 goto got_it;
675 de = (ext2_dirent *) ((char *) de + rec_len);
676 }
677 - unlock_page(page);
678 - ext2_put_page(page);
679 + unlock_page(*page);
680 + ext2_put_page(*page);
681 }
682 +
683 BUG();
684 - return -EINVAL;
685
686 got_it:
687 + return de;
688 + /* OFFSET_CACHE */
689 +out_unlock:
690 + unlock_page(*page);
691 + ext2_put_page(*page);
692 +out:
693 + return de;
694 +}
695 +
696 +/*
697 + * Parent is locked.
698 + */
699 +int ext2_add_link (struct dentry *dentry, struct inode *inode)
700 +{
701 + struct inode *dir = dentry->d_parent->d_inode;
702 + const char *name = dentry->d_name.name;
703 + int namelen = dentry->d_name.len;
704 + unsigned short rec_len, name_len;
705 + ext2_dirent * de;
706 + struct page *page;
707 + loff_t pos;
708 + int err;
709 +
710 + de = ext2_append_entry(dentry, &page);
711 + if (IS_ERR(de))
712 + return PTR_ERR(de);
713 +
714 + err = -EEXIST;
715 + if (ext2_match (namelen, name, de)) {
716 + if ((de->file_type == EXT2_FT_WHT) ||
717 + (de->file_type == EXT2_FT_FALLTHRU))
718 + goto got_it;
719 + goto out_unlock;
720 + }
721 +
722 +got_it:
723 + name_len = EXT2_DIR_REC_LEN(de->name_len);
724 + rec_len = ext2_rec_len_from_disk(de->rec_len);
725 +
726 pos = page_offset(page) +
727 (char*)de - (char*)page_address(page);
728 err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
729 &page, NULL);
730 if (err)
731 goto out_unlock;
732 - if (de->inode) {
733 + if (de->inode || (((de->file_type == EXT2_FT_WHT) ||
734 + (de->file_type == EXT2_FT_FALLTHRU)) &&
735 + !ext2_match (namelen, name, de))) {
736 ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
737 de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
738 de->rec_len = ext2_rec_len_to_disk(name_len);
739 @@ -563,7 +642,60 @@ got_it:
740 /* OFFSET_CACHE */
741 out_put:
742 ext2_put_page(page);
743 -out:
744 + return err;
745 +out_unlock:
746 + unlock_page(page);
747 + goto out_put;
748 +}
749 +
750 +/*
751 + * Create a fallthru entry.
752 + */
753 +int ext2_fallthru_entry (struct inode *dir, struct dentry *dentry)
754 +{
755 + const char *name = dentry->d_name.name;
756 + int namelen = dentry->d_name.len;
757 + unsigned short rec_len, name_len;
758 + ext2_dirent * de;
759 + struct page *page;
760 + loff_t pos;
761 + int err;
762 +
763 + de = ext2_append_entry(dentry, &page);
764 + if (IS_ERR(de))
765 + return PTR_ERR(de);
766 +
767 + err = -EEXIST;
768 + if (ext2_match (namelen, name, de))
769 + goto out_unlock;
770 +
771 + name_len = EXT2_DIR_REC_LEN(de->name_len);
772 + rec_len = ext2_rec_len_from_disk(de->rec_len);
773 +
774 + pos = page_offset(page) +
775 + (char*)de - (char*)page_address(page);
776 + err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
777 + &page, NULL);
778 + if (err)
779 + goto out_unlock;
780 + if (de->inode || (de->file_type == EXT2_FT_WHT) ||
781 + (de->file_type == EXT2_FT_FALLTHRU)) {
782 + ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
783 + de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
784 + de->rec_len = ext2_rec_len_to_disk(name_len);
785 + de = de1;
786 + }
787 + de->name_len = namelen;
788 + memcpy(de->name, name, namelen);
789 + de->inode = 0;
790 + de->file_type = EXT2_FT_FALLTHRU;
791 + err = ext2_commit_chunk(page, pos, rec_len);
792 + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
793 + EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
794 + mark_inode_dirty(dir);
795 + /* OFFSET_CACHE */
796 +out_put:
797 + ext2_put_page(page);
798 return err;
799 out_unlock:
800 unlock_page(page);
801 @@ -616,6 +748,70 @@ out:
802 return err;
803 }
804
805 +int ext2_whiteout_entry (struct inode * dir, struct dentry * dentry,
806 + struct ext2_dir_entry_2 * de, struct page * page)
807 +{
808 + const char *name = dentry->d_name.name;
809 + int namelen = dentry->d_name.len;
810 + unsigned short rec_len, name_len;
811 + loff_t pos;
812 + int err;
813 +
814 + if (!de) /* no entry supplied by the caller: find or append a slot */
815 + de = ext2_append_entry(dentry, &page);
816 + if (IS_ERR(de)) /* ext2_append_entry() fails with ERR_PTR, never NULL */
817 + return PTR_ERR(de);
818 +
819 + err = -EEXIST;
820 + if (ext2_match (namelen, name, de) &&
821 + (de->file_type == EXT2_FT_WHT)) {
822 + ext2_error(dir->i_sb, __func__,
823 + "entry is already a whiteout in directory #%lu",
824 + dir->i_ino);
825 + goto out_unlock;
826 + }
827 +
828 + name_len = EXT2_DIR_REC_LEN(de->name_len);
829 + rec_len = ext2_rec_len_from_disk(de->rec_len);
830 +
831 + pos = page_offset(page) +
832 + (char*)de - (char*)page_address(page);
833 + err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
834 + &page, NULL);
835 + if (err)
836 + goto out_unlock;
837 + /*
838 + * We whiteout an existing entry. Do what ext2_delete_entry() would do,
839 + * except that we don't need to merge with the previous entry since
840 + * we are going to reuse it.
841 + */
842 + if (ext2_match (namelen, name, de))
843 + de->inode = 0;
844 + if (de->inode || (((de->file_type == EXT2_FT_WHT) ||
845 + (de->file_type == EXT2_FT_FALLTHRU)) &&
846 + !ext2_match (namelen, name, de))) {
847 + ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
848 + de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
849 + de->rec_len = ext2_rec_len_to_disk(name_len);
850 + de = de1;
851 + }
852 + de->name_len = namelen;
853 + memcpy(de->name, name, namelen);
854 + de->inode = 0;
855 + de->file_type = EXT2_FT_WHT;
856 + err = ext2_commit_chunk(page, pos, rec_len);
857 + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
858 + EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
859 + mark_inode_dirty(dir);
860 + /* OFFSET_CACHE */
861 +out_put:
862 + ext2_put_page(page);
863 + return err;
864 +out_unlock:
865 + unlock_page(page);
866 + goto out_put;
867 +}
868 +
869 /*
870 * Set the first fragment of directory.
871 */
872 --- a/fs/ext2/ext2.h
873 +++ b/fs/ext2/ext2.h
874 @@ -102,9 +102,13 @@ extern void ext2_rsv_window_add(struct s
875 /* dir.c */
876 extern int ext2_add_link (struct dentry *, struct inode *);
877 extern ino_t ext2_inode_by_name(struct inode *, struct qstr *);
878 +extern ino_t ext2_inode_by_dentry(struct inode *, struct dentry *);
879 extern int ext2_make_empty(struct inode *, struct inode *);
880 extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *, struct page **);
881 extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
882 +extern int ext2_whiteout_entry (struct inode *, struct dentry *,
883 + struct ext2_dir_entry_2 *, struct page *);
884 +extern int ext2_fallthru_entry (struct inode *, struct dentry *);
885 extern int ext2_empty_dir (struct inode *);
886 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
887 extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
888 --- a/fs/ext2/inode.c
889 +++ b/fs/ext2/inode.c
890 @@ -1178,7 +1178,8 @@ void ext2_set_inode_flags(struct inode *
891 {
892 unsigned int flags = EXT2_I(inode)->i_flags;
893
894 - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
895 + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|
896 + S_OPAQUE);
897 if (flags & EXT2_SYNC_FL)
898 inode->i_flags |= S_SYNC;
899 if (flags & EXT2_APPEND_FL)
900 @@ -1189,6 +1190,8 @@ void ext2_set_inode_flags(struct inode *
901 inode->i_flags |= S_NOATIME;
902 if (flags & EXT2_DIRSYNC_FL)
903 inode->i_flags |= S_DIRSYNC;
904 + if (flags & EXT2_OPAQUE_FL)
905 + inode->i_flags |= S_OPAQUE;
906 }
907
908 /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
909 @@ -1196,8 +1199,8 @@ void ext2_get_inode_flags(struct ext2_in
910 {
911 unsigned int flags = ei->vfs_inode.i_flags;
912
913 - ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL|
914 - EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL);
915 + ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL|EXT2_IMMUTABLE_FL|
916 + EXT2_NOATIME_FL|EXT2_DIRSYNC_FL|EXT2_OPAQUE_FL);
917 if (flags & S_SYNC)
918 ei->i_flags |= EXT2_SYNC_FL;
919 if (flags & S_APPEND)
920 @@ -1208,6 +1211,8 @@ void ext2_get_inode_flags(struct ext2_in
921 ei->i_flags |= EXT2_NOATIME_FL;
922 if (flags & S_DIRSYNC)
923 ei->i_flags |= EXT2_DIRSYNC_FL;
924 + if (flags & S_OPAQUE)
925 + ei->i_flags |= EXT2_OPAQUE_FL;
926 }
927
928 struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
929 --- a/fs/ext2/namei.c
930 +++ b/fs/ext2/namei.c
931 @@ -54,15 +54,16 @@ static inline int ext2_add_nondir(struct
932 * Methods themselves.
933 */
934
935 -static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
936 +static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry,
937 + struct nameidata *nd)
938 {
939 struct inode * inode;
940 ino_t ino;
941 -
942 +
943 if (dentry->d_name.len > EXT2_NAME_LEN)
944 return ERR_PTR(-ENAMETOOLONG);
945
946 - ino = ext2_inode_by_name(dir, &dentry->d_name);
947 + ino = ext2_inode_by_dentry(dir, dentry);
948 inode = NULL;
949 if (ino) {
950 inode = ext2_iget(dir->i_sb, ino);
951 @@ -230,6 +231,10 @@ static int ext2_mkdir(struct inode * dir
952 else
953 inode->i_mapping->a_ops = &ext2_aops;
954
955 + /* if we call mkdir on a whiteout create an opaque directory */
956 + if (dentry->d_flags & DCACHE_WHITEOUT)
957 + inode->i_flags |= S_OPAQUE;
958 +
959 inode_inc_link_count(inode);
960
961 err = ext2_make_empty(inode, dir);
962 @@ -293,6 +298,78 @@ static int ext2_rmdir (struct inode * di
963 return err;
964 }
965
966 +/*
967 + * Create a whiteout for dentry in dir and mark new_dentry DCACHE_WHITEOUT; returns 0 or a negative errno.
968 + */
969 +static int ext2_whiteout(struct inode *dir, struct dentry *dentry,
970 + struct dentry *new_dentry)
971 +{
972 + struct inode * inode = dentry->d_inode;
973 + struct ext2_dir_entry_2 * de = NULL;
974 + struct page * page = NULL; /* only filled in by ext2_find_entry() when inode != NULL */
975 + int err = -ENOTEMPTY;
976 +
977 + if (!EXT2_HAS_INCOMPAT_FEATURE(dir->i_sb,
978 + EXT2_FEATURE_INCOMPAT_FILETYPE)) {
979 + ext2_error (dir->i_sb, "ext2_whiteout",
980 + "can't set whiteout filetype");
981 + err = -EPERM;
982 + goto out;
983 + }
984 +
985 + if (inode) {
986 + if (S_ISDIR(inode->i_mode) && !ext2_empty_dir(inode))
987 + goto out;
988 +
989 + err = -ENOENT;
990 + de = ext2_find_entry (dir, &dentry->d_name, &page);
991 + if (!de)
992 + goto out;
993 + lock_page(page);
994 + }
995 +
996 + err = ext2_whiteout_entry (dir, dentry, de, page);
997 + if (err)
998 + goto out;
999 +
1000 + spin_lock(&new_dentry->d_lock);
1001 + new_dentry->d_flags &= ~DCACHE_FALLTHRU;
1002 + new_dentry->d_flags |= DCACHE_WHITEOUT;
1003 + spin_unlock(&new_dentry->d_lock);
1004 + d_add(new_dentry, NULL);
1005 +
1006 + if (inode) {
1007 + inode->i_ctime = dir->i_ctime;
1008 + inode_dec_link_count(inode);
1009 + if (S_ISDIR(inode->i_mode)) {
1010 + inode->i_size = 0;
1011 + inode_dec_link_count(inode);
1012 + inode_dec_link_count(dir);
1013 + }
1014 + }
1015 + err = 0;
1016 +out:
1017 + return err;
1018 +}
1019 +
1020 +/*
1021 + * Create a fallthru entry for dentry in dir, then flag the (inode-less) dentry DCACHE_FALLTHRU.
1022 + */
1023 +static int ext2_fallthru (struct inode *dir, struct dentry *dentry)
1024 +{
1025 + int err;
1026 +
1027 + err = ext2_fallthru_entry(dir, dentry);
1028 + if (err)
1029 + return err;
1030 +
1031 + d_instantiate(dentry, NULL);
1032 + spin_lock(&dentry->d_lock);
1033 + dentry->d_flags |= DCACHE_FALLTHRU;
1034 + spin_unlock(&dentry->d_lock);
1035 + return 0;
1036 +}
1037 +
1038 static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
1039 struct inode * new_dir, struct dentry * new_dentry )
1040 {
1041 @@ -392,6 +469,8 @@ const struct inode_operations ext2_dir_i
1042 .mkdir = ext2_mkdir,
1043 .rmdir = ext2_rmdir,
1044 .mknod = ext2_mknod,
1045 + .whiteout = ext2_whiteout,
1046 + .fallthru = ext2_fallthru,
1047 .rename = ext2_rename,
1048 #ifdef CONFIG_EXT2_FS_XATTR
1049 .setxattr = generic_setxattr,
1050 --- a/fs/ext2/super.c
1051 +++ b/fs/ext2/super.c
1052 @@ -1062,6 +1062,13 @@ static int ext2_fill_super(struct super_
1053 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
1054 ext2_warning(sb, __func__,
1055 "mounting ext3 filesystem as ext2");
1056 +
1057 + /*
1058 + * Whiteouts (and fallthrus) require explicit whiteout support.
1059 + */
1060 + if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_WHITEOUT))
1061 + sb->s_flags |= MS_WHITEOUT;
1062 +
1063 ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
1064 return 0;
1065
1066 --- a/fs/Kconfig
1067 +++ b/fs/Kconfig
1068 @@ -59,6 +59,14 @@ source "fs/notify/Kconfig"
1069
1070 source "fs/quota/Kconfig"
1071
1072 +config UNION_MOUNT
1073 + bool "Union mount support (EXPERIMENTAL)"
1074 + depends on EXPERIMENTAL
1075 + ---help---
1076 + If you say Y here, you will be able to mount file systems as
1077 + union mount stacks. This is a VFS based implementation and
1078 + should work with all file systems. If unsure, say N.
1079 +
1080 source "fs/autofs/Kconfig"
1081 source "fs/autofs4/Kconfig"
1082 source "fs/fuse/Kconfig"
1083 --- a/fs/libfs.c
1084 +++ b/fs/libfs.c
1085 @@ -133,6 +133,7 @@ int dcache_readdir(struct file * filp, v
1086 struct dentry *cursor = filp->private_data;
1087 struct list_head *p, *q = &cursor->d_u.d_child;
1088 ino_t ino;
1089 + int d_type;
1090 int i = filp->f_pos;
1091
1092 switch (i) {
1093 @@ -158,14 +159,25 @@ int dcache_readdir(struct file * filp, v
1094 for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
1095 struct dentry *next;
1096 next = list_entry(p, struct dentry, d_u.d_child);
1097 - if (d_unhashed(next) || !next->d_inode)
1098 + if (d_unhashed(next) || (!next->d_inode && !d_is_fallthru(next)))
1099 continue;
1100
1101 + if (d_is_fallthru(next)) {
1102 + /* XXX Make up things we can
1103 + * only get out of the inode.
1104 + * Should probably really do a
1105 + * lookup instead. */
1106 + ino = 100; /* XXX Made up number of no significance */
1107 + d_type = DT_UNKNOWN;
1108 + } else {
1109 + ino = next->d_inode->i_ino;
1110 + d_type = dt_type(next->d_inode);
1111 + }
1112 +
1113 spin_unlock(&dcache_lock);
1114 if (filldir(dirent, next->d_name.name,
1115 next->d_name.len, filp->f_pos,
1116 - next->d_inode->i_ino,
1117 - dt_type(next->d_inode)) < 0)
1118 + ino, d_type) < 0)
1119 return 0;
1120 spin_lock(&dcache_lock);
1121 /* next is still alive */
1122 --- a/fs/Makefile
1123 +++ b/fs/Makefile
1124 @@ -52,6 +52,7 @@ obj-$(CONFIG_NFS_COMMON) += nfs_common/
1125 obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
1126
1127 obj-y += quota/
1128 +obj-$(CONFIG_UNION_MOUNT) += union.o
1129
1130 obj-$(CONFIG_PROC_FS) += proc/
1131 obj-y += partitions/
1132 --- a/fs/namei.c
1133 +++ b/fs/namei.c
1134 @@ -33,6 +33,7 @@
1135 #include <linux/fcntl.h>
1136 #include <linux/device_cgroup.h>
1137 #include <linux/fs_struct.h>
1138 +#include <linux/union.h>
1139 #include <asm/uaccess.h>
1140
1141 #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
1142 @@ -242,16 +243,17 @@ int generic_permission(struct inode *ino
1143 }
1144
1145 /**
1146 - * inode_permission - check for access rights to a given inode
1147 + * __inode_permission - check for access rights to a given inode
1148 * @inode: inode to check permission on
1149 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
1150 + * @rofs: check for read-only fs
1151 *
1152 * Used to check for read/write/execute permissions on an inode.
1153 * We use "fsuid" for this, letting us set arbitrary permissions
1154 * for filesystem access without changing the "normal" uids which
1155 * are used for other things.
1156 */
1157 -int inode_permission(struct inode *inode, int mask)
1158 +int __inode_permission(struct inode *inode, int mask, int rofs)
1159 {
1160 int retval;
1161
1162 @@ -261,7 +263,7 @@ int inode_permission(struct inode *inode
1163 /*
1164 * Nobody gets write access to a read-only fs.
1165 */
1166 - if (IS_RDONLY(inode) &&
1167 + if (rofs && IS_RDONLY(inode) &&
1168 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
1169 return -EROFS;
1170
1171 @@ -289,6 +291,18 @@ int inode_permission(struct inode *inode
1172 }
1173
1174 /**
1175 + * inode_permission - check for access rights to a given inode
1176 + * @inode: inode to check permission on
1177 + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
1178 + *
1179 + * This version pays attention to the MS_RDONLY flag on the fs.
1180 + */
1181 +int inode_permission(struct inode *inode, int mask)
1182 +{
1183 + return __inode_permission(inode, mask, 1);
1184 +}
1185 +
1186 +/**
1187 * file_permission - check for additional access rights to a given file
1188 * @file: file to check access rights for
1189 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
1190 @@ -417,15 +431,10 @@ do_revalidate(struct dentry *dentry, str
1191 * Internal lookup() using the new generic dcache.
1192 * SMP-safe
1193 */
1194 -static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
1195 +static struct dentry *cache_lookup(struct dentry *parent, struct qstr *name,
1196 + struct nameidata *nd)
1197 {
1198 - struct dentry * dentry = __d_lookup(parent, name);
1199 -
1200 - /* lockess __d_lookup may fail due to concurrent d_move()
1201 - * in some unrelated directory, so try with d_lookup
1202 - */
1203 - if (!dentry)
1204 - dentry = d_lookup(parent, name);
1205 + struct dentry *dentry = d_lookup(parent, name);
1206
1207 if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
1208 dentry = do_revalidate(dentry, nd);
1209 @@ -434,6 +443,208 @@ static struct dentry * cached_lookup(str
1210 }
1211
1212 /*
1213 + * Theory of operation for opaque, whiteout, and fallthru:
1214 + *
1215 + * whiteout: Unconditionally stop lookup here - ENOENT
1216 + *
1217 + * opaque: Don't lookup in directories lower in the union stack
1218 + *
1219 + * fallthru: While looking up an entry, ignore the opaque flag for the
1220 + * current directory only.
1221 + *
1222 + * A union stack is a linked list of directory dentries which appear
1223 + * in the same place in the namespace. When constructing the union
1224 + * stack, we include directories below opaque directories so that we
1225 + * can properly handle fallthrus. All non-fallthru lookups have to
1226 + * check for the opaque flag on the parent directory and obey it.
1227 + *
1228 + * In general, the code pattern is to look up the topmost entry
1229 + * first (either the first visible non-negative dentry or a negative
1230 + * dentry in the topmost layer of the union), then build the union
1231 + * stack for the newly looked-up entry (if it is a directory).
1232 + */
1233 +
1234 +/**
1235 + * __cache_lookup_topmost - lookup the topmost (non-)negative dentry
1236 + *
1237 + * @nd - parent's nameidata
1238 + * @name - pathname part to lookup
1239 + * @path - found dentry for pathname part
1240 + *
1241 + * This is used for union mount lookups from dcache. The first non-negative
1242 + * dentry is searched on all layers of the union stack. Otherwise the topmost
1243 + * negative dentry is returned.
1244 + */
1245 +static int __cache_lookup_topmost(struct nameidata *nd, struct qstr *name,
1246 + struct path *path)
1247 +{
1248 + struct dentry *dentry;
1249 +
1250 + dentry = d_lookup(nd->path.dentry, name);
1251 + if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
1252 + dentry = do_revalidate(dentry, nd);
1253 +
1254 + /*
1255 + * Remember the topmost negative dentry in case we don't find anything
1256 + */
1257 + path->dentry = dentry;
1258 + path->mnt = dentry ? nd->path.mnt : NULL;
1259 +
1260 + if (!dentry || (dentry->d_inode || d_is_whiteout(dentry)))
1261 + return !dentry;
1262 +
1263 + /* Keep going through opaque directories if we found a fallthru */
1264 + if (IS_OPAQUE(nd->path.dentry->d_inode) && !d_is_fallthru(dentry))
1265 + return !dentry;
1266 +
1267 + /* look for the first non-negative or whiteout dentry */
1268 +
1269 + while (follow_union_down(&nd->path)) {
1270 + dentry = d_hash_and_lookup(nd->path.dentry, name);
1271 +
1272 + /*
1273 + * If parts of the union stack are not in the dcache we need
1274 + * to do a real lookup
1275 + */
1276 + if (!dentry)
1277 + goto out_dput;
1278 +
1279 + /*
1280 + * If parts of the union don't survive the revalidation we
1281 + * need to do a real lookup
1282 + */
1283 + if (dentry->d_op && dentry->d_op->d_revalidate) {
1284 + dentry = do_revalidate(dentry, nd);
1285 + if (!dentry)
1286 + goto out_dput;
1287 + }
1288 +
1289 + if (dentry->d_inode || d_is_whiteout(dentry))
1290 + goto out_dput;
1291 +
1292 + /* Stop the lookup on opaque parent and non-fallthru child */
1293 + if (IS_OPAQUE(nd->path.dentry->d_inode) && !d_is_fallthru(dentry))
1294 + goto out_dput;
1295 +
1296 + dput(dentry);
1297 + }
1298 +
1299 + return !dentry;
1300 +
1301 +out_dput:
1302 + dput(path->dentry);
1303 + path->dentry = dentry;
1304 + path->mnt = dentry ? mntget(nd->path.mnt) : NULL;
1305 + return !dentry;
1306 +}
1307 +
1308 +/**
1309 + * __cache_lookup_build_union - build the union stack for this part,
1310 + * cached version
1311 + *
1312 + * This is called after you have the topmost dentry in @path.
1313 + */
1314 +static int __cache_lookup_build_union(struct nameidata *nd, struct qstr *name,
1315 + struct path *path)
1316 +{
1317 + struct path last = *path;
1318 + struct dentry *dentry;
1319 +
1320 + while (follow_union_down(&nd->path)) {
1321 + dentry = d_hash_and_lookup(nd->path.dentry, name);
1322 + if (!dentry)
1323 + return 1;
1324 +
1325 + if (dentry->d_op && dentry->d_op->d_revalidate) {
1326 + dentry = do_revalidate(dentry, nd);
1327 + if (!dentry)
1328 + return 1;
1329 + }
1330 +
1331 + if (d_is_whiteout(dentry)) {
1332 + dput(dentry);
1333 + break;
1334 + }
1335 +
1336 + if (!dentry->d_inode) {
1337 + dput(dentry);
1338 + continue;
1339 + }
1340 +
1341 + /* only directories can be part of a union stack */
1342 + if (!S_ISDIR(dentry->d_inode->i_mode)) {
1343 + dput(dentry);
1344 + break;
1345 + }
1346 +
1347 + /* Add the newly discovered dir to the union stack */
1348 + append_to_union(last.mnt, last.dentry, nd->path.mnt, dentry);
1349 +
1350 + if (last.dentry != path->dentry)
1351 + path_put(&last);
1352 + last.dentry = dentry;
1353 + last.mnt = mntget(nd->path.mnt);
1354 + }
1355 +
1356 + if (last.dentry != path->dentry)
1357 + path_put(&last);
1358 +
1359 + return 0;
1360 +}
1361 +
1362 +/**
1363 + * cache_lookup_union - lookup a single pathname part from dcache
1364 + *
1365 + * This is a union mount capable version of what d_lookup() & revalidate()
1366 + * would do. This function returns a valid (union) dentry on success.
1367 + *
1368 + * Remember: On failure it means that parts of the union aren't cached. You
1369 + * should call real_lookup() afterwards to find the proper (union) dentry.
1370 + */
1371 +static int cache_lookup_union(struct nameidata *nd, struct qstr *name,
1372 + struct path *path)
1373 +{
1374 + int res ;
1375 +
1376 + if (!IS_MNT_UNION(nd->path.mnt)) {
1377 + path->dentry = cache_lookup(nd->path.dentry, name, nd);
1378 + path->mnt = path->dentry ? nd->path.mnt : NULL;
1379 + res = path->dentry ? 0 : 1;
1380 + } else {
1381 + struct path safe = {
1382 + .dentry = nd->path.dentry,
1383 + .mnt = nd->path.mnt
1384 + };
1385 +
1386 + path_get(&safe);
1387 + res = __cache_lookup_topmost(nd, name, path);
1388 + if (res)
1389 + goto out;
1390 +
1391 + /* only directories can be part of a union stack */
1392 + if (!path->dentry->d_inode ||
1393 + !S_ISDIR(path->dentry->d_inode->i_mode))
1394 + goto out;
1395 +
1396 + /* Build the union stack for this part */
1397 + res = __cache_lookup_build_union(nd, name, path);
1398 + if (res) {
1399 + dput(path->dentry);
1400 + if (path->mnt != safe.mnt)
1401 + mntput(path->mnt);
1402 + goto out;
1403 + }
1404 +
1405 +out:
1406 + path_put(&nd->path);
1407 + nd->path.dentry = safe.dentry;
1408 + nd->path.mnt = safe.mnt;
1409 + }
1410 +
1411 + return res;
1412 +}
1413 +
1414 +/*
1415 * Short-cut version of permission(), for calling by
1416 * path_walk(), when dcache lock is held. Combines parts
1417 * of permission() and generic_permission(), and tests ONLY for
1418 @@ -473,10 +684,11 @@ ok:
1419 * make sure that nobody added the entry to the dcache in the meantime..
1420 * SMP-safe
1421 */
1422 -static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
1423 +static int real_lookup(struct nameidata *nd, struct qstr *name,
1424 + struct path *path)
1425 {
1426 - struct dentry * result;
1427 - struct inode *dir = parent->d_inode;
1428 + struct inode *dir = nd->path.dentry->d_inode;
1429 + int res = 0;
1430
1431 mutex_lock(&dir->i_mutex);
1432 /*
1433 @@ -493,27 +705,36 @@ static struct dentry * real_lookup(struc
1434 *
1435 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
1436 */
1437 - result = d_lookup(parent, name);
1438 - if (!result) {
1439 + path->dentry = d_lookup(nd->path.dentry, name);
1440 + path->mnt = nd->path.mnt;
1441 + if (!path->dentry) {
1442 struct dentry *dentry;
1443
1444 /* Don't create child dentry for a dead directory. */
1445 - result = ERR_PTR(-ENOENT);
1446 - if (IS_DEADDIR(dir))
1447 + if (IS_DEADDIR(dir)) {
1448 + res = -ENOENT;
1449 goto out_unlock;
1450 + }
1451
1452 - dentry = d_alloc(parent, name);
1453 - result = ERR_PTR(-ENOMEM);
1454 + dentry = d_alloc(nd->path.dentry, name);
1455 if (dentry) {
1456 - result = dir->i_op->lookup(dir, dentry, nd);
1457 - if (result)
1458 + path->dentry = dir->i_op->lookup(dir, dentry, nd);
1459 + if (path->dentry) {
1460 dput(dentry);
1461 - else
1462 - result = dentry;
1463 + if (IS_ERR(path->dentry)) {
1464 + res = PTR_ERR(path->dentry);
1465 + path->dentry = NULL;
1466 + path->mnt = NULL;
1467 + }
1468 + } else
1469 + path->dentry = dentry;
1470 + } else {
1471 + res = -ENOMEM;
1472 + path->mnt = NULL;
1473 }
1474 out_unlock:
1475 mutex_unlock(&dir->i_mutex);
1476 - return result;
1477 + return res;
1478 }
1479
1480 /*
1481 @@ -521,12 +742,170 @@ out_unlock:
1482 * we waited on the semaphore. Need to revalidate.
1483 */
1484 mutex_unlock(&dir->i_mutex);
1485 - if (result->d_op && result->d_op->d_revalidate) {
1486 - result = do_revalidate(result, nd);
1487 - if (!result)
1488 - result = ERR_PTR(-ENOENT);
1489 + if (path->dentry->d_op && path->dentry->d_op->d_revalidate) {
1490 + path->dentry = do_revalidate(path->dentry, nd);
1491 + if (!path->dentry) {
1492 + res = -ENOENT;
1493 + path->mnt = NULL;
1494 + }
1495 + if (IS_ERR(path->dentry)) {
1496 + res = PTR_ERR(path->dentry);
1497 + path->dentry = NULL;
1498 + path->mnt = NULL;
1499 + }
1500 }
1501 - return result;
1502 +
1503 + return res;
1504 +}
1505 +
1506 +/**
1507 + * __real_lookup_topmost - lookup topmost dentry, non-cached version
1508 + *
1509 + * If we reach a dentry with restricted access, we just stop the lookup
1510 + * because we shouldn't see through that dentry. Same thing for dentry
1511 + * type mismatch and whiteouts.
1512 + *
1513 + * FIXME:
1514 + * - handle union stacks in use
1515 + * - handle union stacks mounted upon union stacks
1516 + * - avoid unnecessary allocations of union locks
1517 + */
1518 +static int __real_lookup_topmost(struct nameidata *nd, struct qstr *name,
1519 + struct path *path)
1520 +{
1521 + struct path next;
1522 + int err, stop;
1523 +
1524 + err = real_lookup(nd, name, path);
1525 + if (err)
1526 + return err;
1527 +
1528 + if (path->dentry->d_inode || d_is_whiteout(path->dentry))
1529 + return 0;
1530 +
1531 + if (IS_OPAQUE(nd->path.dentry->d_inode) && !d_is_fallthru(path->dentry))
1532 + return 0;
1533 +
1534 + while (follow_union_down(&nd->path)) {
1535 + name->hash = full_name_hash(name->name, name->len);
1536 + if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
1537 + err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
1538 + name);
1539 + if (err < 0)
1540 + goto out;
1541 + }
1542 +
1543 + err = real_lookup(nd, name, &next);
1544 + if (err)
1545 + goto out;
1546 +
1547 + if (next.dentry->d_inode || d_is_whiteout(next.dentry)) {
1548 + dput(path->dentry);
1549 + mntget(next.mnt);
1550 + *path = next;
1551 + goto out;
1552 + }
1553 +
1554 + stop = IS_OPAQUE(nd->path.dentry->d_inode) && !d_is_fallthru(next.dentry);
1555 + dput(next.dentry); /* next is not returned: drop it even on an opaque stop */
1556 + if (stop)
1557 + goto out;
1558 + }
1559 +out:
1560 + if (err)
1561 + dput(path->dentry);
1562 + return err;
1563 +}
1564 +
1565 +/**
1566 + * __real_lookup_build_union: build the union stack for this pathname
1567 + * part, non-cached version
1568 + *
1569 + * Called when not all parts of the union stack are in cache
1570 + */
1571 +
1572 +static int __real_lookup_build_union(struct nameidata *nd, struct qstr *name,
1573 + struct path *path)
1574 +{
1575 + struct path last = *path;
1576 + struct path next;
1577 + int err = 0;
1578 +
1579 + while (follow_union_down(&nd->path)) {
1580 + /* We need to recompute the hash for lower layer lookups */
1581 + name->hash = full_name_hash(name->name, name->len);
1582 + if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
1583 + err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
1584 + name);
1585 + if (err < 0)
1586 + goto out;
1587 + }
1588 +
1589 + err = real_lookup(nd, name, &next);
1590 + if (err)
1591 + goto out;
1592 +
1593 + if (d_is_whiteout(next.dentry)) {
1594 + dput(next.dentry);
1595 + break;
1596 + }
1597 +
1598 + if (!next.dentry->d_inode) {
1599 + dput(next.dentry);
1600 + continue;
1601 + }
1602 +
1603 + /* only directories can be part of a union stack */
1604 + if (!S_ISDIR(next.dentry->d_inode->i_mode)) {
1605 + dput(next.dentry);
1606 + break;
1607 + }
1608 +
1609 + /* now we know we found something "real" */
1610 + append_to_union(last.mnt, last.dentry, next.mnt, next.dentry);
1611 +
1612 + if (last.dentry != path->dentry)
1613 + path_put(&last);
1614 + last.dentry = next.dentry;
1615 + last.mnt = mntget(next.mnt);
1616 + }
1617 +
1618 + if (last.dentry != path->dentry)
1619 + path_put(&last);
1620 +out:
1621 + return err;
1622 +}
1623 +
1624 +static int real_lookup_union(struct nameidata *nd, struct qstr *name,
1625 + struct path *path)
1626 +{
1627 + struct path safe = { .dentry = nd->path.dentry, .mnt = nd->path.mnt };
1628 + int res ;
1629 +
1630 + path_get(&safe);
1631 + res = __real_lookup_topmost(nd, name, path);
1632 + if (res)
1633 + goto out;
1634 +
1635 + /* only directories can be part of a union stack */
1636 + if (!path->dentry->d_inode ||
1637 + !S_ISDIR(path->dentry->d_inode->i_mode))
1638 + goto out;
1639 +
1640 + /* Build the union stack for this part */
1641 + res = __real_lookup_build_union(nd, name, path);
1642 + if (res) {
1643 + dput(path->dentry);
1644 + if (path->mnt != safe.mnt)
1645 + mntput(path->mnt);
1646 + goto out;
1647 + }
1648 +
1649 +out:
1650 + path_put(&nd->path);
1651 + nd->path.dentry = safe.dentry;
1652 + nd->path.mnt = safe.mnt;
1653 + return res;
1654 }
1655
1656 /*
1657 @@ -629,11 +1008,8 @@ static __always_inline int __do_follow_l
1658 touch_atime(path->mnt, dentry);
1659 nd_set_link(nd, NULL);
1660
1661 - if (path->mnt != nd->path.mnt) {
1662 - path_to_nameidata(path, nd);
1663 - dget(dentry);
1664 - }
1665 - mntget(path->mnt);
1666 + if (path->mnt == nd->path.mnt)
1667 + mntget(nd->path.mnt);
1668 cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
1669 error = PTR_ERR(cookie);
1670 if (!IS_ERR(cookie)) {
1671 @@ -721,7 +1097,7 @@ static int __follow_mount(struct path *p
1672 return res;
1673 }
1674
1675 -static void follow_mount(struct path *path)
1676 +void follow_mount(struct path *path)
1677 {
1678 while (d_mountpoint(path->dentry)) {
1679 struct vfsmount *mounted = lookup_mnt(path);
1680 @@ -786,6 +1162,7 @@ static __always_inline void follow_dotdo
1681 nd->path.mnt = parent;
1682 }
1683 follow_mount(&nd->path);
1684 + follow_union_mount(&nd->path);
1685 }
1686
1687 /*
1688 @@ -796,35 +1173,55 @@ static __always_inline void follow_dotdo
1689 static int do_lookup(struct nameidata *nd, struct qstr *name,
1690 struct path *path)
1691 {
1692 - struct vfsmount *mnt = nd->path.mnt;
1693 - struct dentry *dentry = __d_lookup(nd->path.dentry, name);
1694 + int err;
1695 +
1696 + if (IS_MNT_UNION(nd->path.mnt))
1697 + goto need_union_lookup;
1698
1699 - if (!dentry)
1700 + path->dentry = __d_lookup(nd->path.dentry, name);
1701 + path->mnt = nd->path.mnt;
1702 + if (!path->dentry)
1703 goto need_lookup;
1704 - if (dentry->d_op && dentry->d_op->d_revalidate)
1705 + if (path->dentry->d_op && path->dentry->d_op->d_revalidate)
1706 goto need_revalidate;
1707 +
1708 done:
1709 - path->mnt = mnt;
1710 - path->dentry = dentry;
1711 - __follow_mount(path);
1712 + if (nd->path.mnt != path->mnt) {
1713 + nd->um_flags |= LAST_LOWLEVEL;
1714 + follow_mount(path);
1715 + } else
1716 + __follow_mount(path);
1717 + follow_union_mount(path);
1718 return 0;
1719
1720 need_lookup:
1721 - dentry = real_lookup(nd->path.dentry, name, nd);
1722 - if (IS_ERR(dentry))
1723 + err = real_lookup(nd, name, path);
1724 + if (err)
1725 + goto fail;
1726 + goto done;
1727 +
1728 +need_union_lookup:
1729 + err = cache_lookup_union(nd, name, path);
1730 + if (!err && path->dentry)
1731 + goto done;
1732 +
1733 + err = real_lookup_union(nd, name, path);
1734 + if (err)
1735 goto fail;
1736 goto done;
1737
1738 need_revalidate:
1739 - dentry = do_revalidate(dentry, nd);
1740 - if (!dentry)
1741 + path->dentry = do_revalidate(path->dentry, nd);
1742 + if (!path->dentry)
1743 goto need_lookup;
1744 - if (IS_ERR(dentry))
1745 + if (IS_ERR(path->dentry)) {
1746 + err = PTR_ERR(path->dentry);
1747 goto fail;
1748 + }
1749 goto done;
1750
1751 fail:
1752 - return PTR_ERR(dentry);
1753 + return err;
1754 }
1755
1756 /*
1757 @@ -851,6 +1248,8 @@ static int __link_path_walk(const char *
1758 if (nd->depth)
1759 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
1760
1761 + follow_union_mount(&nd->path);
1762 +
1763 /* At this point we know we have a real path component. */
1764 for(;;) {
1765 unsigned long hash;
1766 @@ -913,6 +1312,44 @@ static int __link_path_walk(const char *
1767 if (err)
1768 break;
1769
1770 + /*
1771 + * We want to create this element on the top level
1772 + * file system in two cases:
1773 + *
1774 + * - We are specifically told to - LOOKUP_TOPMOST.
1775 + * - This is a directory, and it does not yet exist on
1776 + * the top level. Various tricks only work if
1777 + * directories always exist on the top level.
1778 + *
1779 + * In either case, only create this element on the top
1780 + * level if the last element is located on the lower
1781 + * level. If the last element is located on the top
1782 + * level, then every single element in the path
1783 + * already exists on the top level.
1784 + *
1785 + * Note that we can assume that the parent is on the
1786 + * top level since we always create the directory on
1787 + * the top level.
1788 + */
1789 +
1790 + if ((nd->um_flags & LAST_LOWLEVEL) &&
1791 + ((next.dentry->d_inode &&
1792 + S_ISDIR(next.dentry->d_inode->i_mode) &&
1793 + (nd->path.mnt != next.mnt)) ||
1794 + (nd->flags & LOOKUP_TOPMOST))) {
1795 + struct dentry *dentry;
1796 +
1797 + dentry = union_create_topmost(nd, &this, &next);
1798 + if (IS_ERR(dentry)) {
1799 + err = PTR_ERR(dentry);
1800 + goto out_dput;
1801 + }
1802 + path_put_conditional(&next, nd);
1803 + next.mnt = nd->path.mnt;
1804 + next.dentry = dentry;
1805 + nd->um_flags &= ~LAST_LOWLEVEL;
1806 + }
1807 +
1808 err = -ENOENT;
1809 inode = next.dentry->d_inode;
1810 if (!inode)
1811 @@ -962,6 +1399,25 @@ last_component:
1812 err = do_lookup(nd, &this, &next);
1813 if (err)
1814 break;
1815 +
1816 + if ((nd->um_flags & LAST_LOWLEVEL) &&
1817 + ((next.dentry->d_inode &&
1818 + S_ISDIR(next.dentry->d_inode->i_mode) &&
1819 + (nd->path.mnt != next.mnt)) ||
1820 + (nd->flags & LOOKUP_TOPMOST))) {
1821 + struct dentry *dentry;
1822 +
1823 + dentry = union_create_topmost(nd, &this, &next);
1824 + if (IS_ERR(dentry)) {
1825 + err = PTR_ERR(dentry);
1826 + goto out_dput;
1827 + }
1828 + path_put_conditional(&next, nd);
1829 + next.mnt = nd->path.mnt;
1830 + next.dentry = dentry;
1831 + nd->um_flags &= ~LAST_LOWLEVEL;
1832 + }
1833 +
1834 inode = next.dentry->d_inode;
1835 if ((lookup_flags & LOOKUP_FOLLOW)
1836 && inode && inode->i_op->follow_link) {
1837 @@ -1029,6 +1485,7 @@ static int path_init(int dfd, const char
1838
1839 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1840 nd->flags = flags;
1841 + nd->um_flags = 0;
1842 nd->depth = 0;
1843 nd->root.mnt = NULL;
1844
1845 @@ -1172,61 +1629,437 @@ static int path_lookup_open(int dfd, con
1846 }
1847
1848 static struct dentry *__lookup_hash(struct qstr *name,
1849 - struct dentry *base, struct nameidata *nd)
1850 + struct dentry *base, struct nameidata *nd)
1851 +{
1852 + struct dentry *dentry;
1853 + struct inode *inode;
1854 + int err;
1855 +
1856 + inode = base->d_inode;
1857 +
1858 + /*
1859 + * See if the low-level filesystem might want
1860 + * to use its own hash..
1861 + */
1862 + if (base->d_op && base->d_op->d_hash) {
1863 + err = base->d_op->d_hash(base, name);
1864 + dentry = ERR_PTR(err);
1865 + if (err < 0)
1866 + goto out;
1867 + }
1868 +
1869 + dentry = cache_lookup(base, name, nd);
1870 + if (!dentry) {
1871 + struct dentry *new;
1872 +
1873 + /* Don't create child dentry for a dead directory. */
1874 + dentry = ERR_PTR(-ENOENT);
1875 + if (IS_DEADDIR(inode))
1876 + goto out;
1877 +
1878 + new = d_alloc(base, name);
1879 + dentry = ERR_PTR(-ENOMEM);
1880 + if (!new)
1881 + goto out;
1882 + dentry = inode->i_op->lookup(inode, new, nd);
1883 + if (!dentry)
1884 + dentry = new;
1885 + else
1886 + dput(new);
1887 + }
1888 +out:
1889 + return dentry;
1890 +}
1891 +
1892 +/*
1893 + * Restricted form of lookup. Doesn't follow links, single-component only,
1894 + * needs parent already locked. Doesn't follow mounts.
1895 + * SMP-safe.
1896 + */
1897 +static int lookup_hash(struct nameidata *nd, struct qstr *name,
1898 + struct path *path)
1899 +{
1900 + int err;
1901 +
1902 + err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC);
1903 + if (err)
1904 + return err;
1905 + path->mnt = nd->path.mnt;
1906 + path->dentry = __lookup_hash(name, nd->path.dentry, nd);
1907 + if (IS_ERR(path->dentry)) {
1908 + err = PTR_ERR(path->dentry);
1909 + path->dentry = NULL;
1910 + path->mnt = NULL;
1911 + }
1912 + return err;
1913 +}
1914 +
1915 +static int __hash_lookup_topmost(struct nameidata *nd, struct qstr *name,
1916 + struct path *path)
1917 +{
1918 + struct path next;
1919 + int err, stop;
1920 +
1921 + err = lookup_hash(nd, name, path);
1922 + if (err)
1923 + return err;
1924 +
1925 + if (path->dentry->d_inode || d_is_whiteout(path->dentry))
1926 + return 0;
1927 +
1928 + if (IS_OPAQUE(nd->path.dentry->d_inode) && !d_is_fallthru(path->dentry))
1929 + return 0;
1930 +
1931 + while (follow_union_down(&nd->path)) {
1932 + name->hash = full_name_hash(name->name, name->len);
1933 + if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
1934 + err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
1935 + name);
1936 + if (err < 0)
1937 + goto out;
1938 + }
1939 +
1940 + mutex_lock(&nd->path.dentry->d_inode->i_mutex);
1941 + err = lookup_hash(nd, name, &next);
1942 + mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
1943 + if (err)
1944 + goto out;
1945 +
1946 + if (next.dentry->d_inode || d_is_whiteout(next.dentry)) {
1947 + dput(path->dentry);
1948 + mntget(next.mnt);
1949 + *path = next;
1950 + goto out;
1951 + }
1952 +
1953 + stop = IS_OPAQUE(nd->path.dentry->d_inode) && !d_is_fallthru(next.dentry);
1954 + dput(next.dentry); /* next is not returned: drop it even on an opaque stop */
1955 + if (stop)
1956 + goto out;
1957 + }
1958 +out:
1959 + if (err)
1960 + dput(path->dentry);
1961 + return err;
1962 +}
1963 +
1964 +static int __hash_lookup_build_union(struct nameidata *nd, struct qstr *name,
1965 + struct path *path)
1966 +{
1967 + struct path last = *path;
1968 + struct path next;
1969 + int err = 0;
1970 +
1971 + while (follow_union_down(&nd->path)) {
1972 + /* We need to recompute the hash for lower layer lookups */
1973 + name->hash = full_name_hash(name->name, name->len);
1974 + if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
1975 + err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
1976 + name);
1977 + if (err < 0)
1978 + goto out;
1979 + }
1980 +
1981 + mutex_lock(&nd->path.dentry->d_inode->i_mutex);
1982 + err = lookup_hash(nd, name, &next);
1983 + mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
1984 + if (err)
1985 + goto out;
1986 +
1987 + if (d_is_whiteout(next.dentry)) {
1988 + dput(next.dentry);
1989 + break;
1990 + }
1991 +
1992 + if (!next.dentry->d_inode) {
1993 + dput(next.dentry);
1994 + continue;
1995 + }
1996 +
1997 + /* only directories can be part of a union stack */
1998 + if (!S_ISDIR(next.dentry->d_inode->i_mode)) {
1999 + dput(next.dentry);
2000 + break;
2001 + }
2002 +
2003 + /* now we know we found something "real" */
2004 + append_to_union(last.mnt, last.dentry, next.mnt, next.dentry);
2005 +
2006 + if (last.dentry != path->dentry)
2007 + path_put(&last);
2008 + last.dentry = next.dentry;
2009 + last.mnt = mntget(next.mnt);
2010 + }
2011 +
2012 + if (last.dentry != path->dentry)
2013 + path_put(&last);
2014 +out:
2015 + return err;
2016 +}
2017 +
2018 +int hash_lookup_union(struct nameidata *nd, struct qstr *name,
2019 + struct path *path)
2020 +{
2021 + struct path safe = { .dentry = nd->path.dentry, .mnt = nd->path.mnt };
2022 + int res ;
2023 +
2024 + path_get(&safe);
2025 + res = __hash_lookup_topmost(nd, name, path);
2026 + if (res)
2027 + goto out;
2028 +
2029 + /* only directories can be part of a union stack */
2030 + if (!path->dentry->d_inode ||
2031 + !S_ISDIR(path->dentry->d_inode->i_mode))
2032 + goto out;
2033 +
2034 + /* Build the union stack for this part */
2035 + res = __hash_lookup_build_union(nd, name, path);
2036 + if (res) {
2037 + dput(path->dentry);
2038 + if (path->mnt != safe.mnt)
2039 + mntput(path->mnt);
2040 + goto out;
2041 + }
2042 +
2043 +out:
2044 + path_put(&nd->path);
2045 + nd->path.dentry = safe.dentry;
2046 + nd->path.mnt = safe.mnt;
2047 + return res;
2048 +}
2049 +
2050 +/**
2051 + * do_union_hash_lookup() - walk down the union stack and lookup_hash()
2052 + * @nd: nameidata of parent to lookup from
2053 + * @name: pathname component to lookup
2054 + * @path: path to store result of lookup in
2055 + *
2056 + * Walk down the union stack and search for a single pathname component. It
2057 + * is assumed that the caller already did a lookup_hash() in the topmost parent
2058 + * that gave a negative lookup result. Therefore this calls lookup_hash() in
2059 + * every lower layer (!) of the union stack. If a directory is found the union
2060 + * stack for that is assembled as well.
2061 + *
2062 + * Note:
2063 + * The caller needs to take care of holding a valid reference to the topmost
2064 + * parent.
2065 + * On error we leave @path untouched as well as when we don't find anything.
2066 + */
2067 +static int do_union_hash_lookup(struct nameidata *nd, struct qstr *name,
2068 + struct path *path)
2069 +{
2070 + struct path next;
2071 + int err = 0, stop;
2072 +
2073 + while (follow_union_down(&nd->path)) {
2074 + /* rehash because of d_op->d_hash() by the previous layer */
2075 + name->hash = full_name_hash(name->name, name->len);
2076 +
2077 + mutex_lock(&nd->path.dentry->d_inode->i_mutex);
2078 + err = lookup_hash(nd, name, &next);
2079 + mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
2080 +
2081 + if (err)
2082 + break;
2083 +
2084 + if (next.dentry->d_inode) {
2085 + mntget(next.mnt);
2086 + if (!S_ISDIR(next.dentry->d_inode->i_mode)) {
2087 + *path = next;
2088 + break;
2089 + }
2090 + err = __hash_lookup_build_union(nd, name, &next);
2091 + if (err)
2092 + path_put(&next);
2093 + else
2094 + *path = next;
2095 + break;
2096 + }
2097 + /* test the dentry's flags while we still hold our reference to it */
2098 + stop = (IS_OPAQUE(nd->path.dentry->d_inode) &&
2099 + !d_is_fallthru(next.dentry)) ||
2100 + d_is_whiteout(next.dentry);
2101 + path_put_conditional(&next, nd);
2102 + if (stop)
2103 + break;
2104 + }
2105 +
2106 + return err;
2107 +}
2108 +
2109 +/**
2110 + * _hash_lookup_union() - lookup single pathname component
2111 + * @nd: nameidata of parent to lookup from
2112 + * @name: pathname component to lookup
2113 + * @path: path to store result of lookup in
2114 + *
2115 + * Returns the topmost parent locked and the target dentry found in the union
2116 + * or the topmost negative target dentry otherwise.
2117 + *
2118 + * Note:
2119 + * Returns topmost parent locked even on error.
2120 + */
2121 +static int _hash_lookup_union(struct nameidata *nd, struct qstr *name,
2122 + struct path *path)
2123 +{
2124 + struct path parent = nd->path;
2125 + struct path topmost;
2126 + int err;
2127 +
2128 + mutex_lock(&nd->path.dentry->d_inode->i_mutex);
2129 + err = lookup_hash(nd, name, path);
2130 + if (err)
2131 + return err;
2132 +
2133 + /* if we found something and it isn't a directory we are done */
2134 + if (path->dentry->d_inode && !S_ISDIR(path->dentry->d_inode->i_mode))
2135 + return 0;
2136 +
2137 + /* stop lookup if the parent directory is marked opaque */
2138 + if ((IS_OPAQUE(nd->path.dentry->d_inode) &&
2139 + !d_is_fallthru(path->dentry)) ||
2140 + d_is_whiteout(path->dentry))
2141 + return 0;
2142 +
2143 + if (!strcmp(path->mnt->mnt_sb->s_type->name, "proc") ||
2144 + !strcmp(path->mnt->mnt_sb->s_type->name, "sysfs"))
2145 + return 0;
2146 +
2147 + mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
2148 +
2149 + /*
2150 + * save a reference to the topmost parent for walking the union stack
2151 + */
2152 + path_get(&parent);
2153 + topmost = *path;
2154 +
2155 + if (path->dentry->d_inode && S_ISDIR(path->dentry->d_inode->i_mode)) {
2156 + err = __hash_lookup_build_union(nd, name, path);
2157 + if (err)
2158 + goto err_lock_parent;
2159 + goto out_lock_and_revalidate_parent;
2160 + }
2161 +
2162 + err = do_union_hash_lookup(nd, name, path);
2163 + if (err)
2164 + goto err_lock_parent;
2165 +
2166 +out_lock_and_revalidate_parent:
2167 + /* seems that we haven't found anything, so return the topmost */
2168 + path_to_nameidata(&parent, nd);
2169 + mutex_lock(&nd->path.dentry->d_inode->i_mutex);
2170 +
2171 + if (topmost.dentry == path->dentry) {
2172 + spin_lock(&path->dentry->d_lock);
2173 + if (nd->path.dentry != path->dentry->d_parent) {
2174 + spin_unlock(&path->dentry->d_lock);
2175 + dput(path->dentry);
2176 + name->hash = full_name_hash(name->name, name->len);
2177 + err = lookup_hash(nd, name, path);
2178 + if (err)
2179 + return err;
2180 + /* FIXME: What if we find a directory here ... */
2181 + return err;
2182 + }
2183 + spin_unlock(&path->dentry->d_lock);
2184 + } else
2185 + dput(topmost.dentry);
2186 +
2187 + return 0;
2188 +
2189 +err_lock_parent:
2190 + path_to_nameidata(&parent, nd);
2191 + path_put_conditional(path, nd);
2192 + mutex_lock(&nd->path.dentry->d_inode->i_mutex);
2193 + return err;
2194 +}
2195 +
2196 +/**
2197 + * lookup_rename_source() - lookup the source used by rename
2198 + *
2199 + * This is a special version of _hash_lookup_union() which becomes necessary
2200 + * for finding the source of a rename on union mounts.
2201 + *
2202 + * See comment for _hash_lookup_union() above.
2203 + */
2204 +static int lookup_rename_source(struct nameidata *oldnd,
2205 + struct nameidata *newnd,
2206 + struct dentry **trap, struct qstr *name,
2207 + struct path *old)
2208 {
2209 - struct dentry *dentry;
2210 - struct inode *inode;
2211 + struct path parent = oldnd->path;
2212 + struct path topmost;
2213 int err;
2214
2215 - inode = base->d_inode;
2216 + err = lookup_hash(oldnd, name, old);
2217 + if (err)
2218 + return err;
2219 +
2220 + /* if we found something and it isn't a directory we are done */
2221 + if (old->dentry->d_inode && !S_ISDIR(old->dentry->d_inode->i_mode))
2222 + return 0;
2223 +
2224 + /* stop lookup if the parent directory is marked opaque */
2225 + if ((IS_OPAQUE(oldnd->path.dentry->d_inode) &&
2226 + !d_is_fallthru(old->dentry)) ||
2227 + d_is_whiteout(old->dentry))
2228 + return 0;
2229 +
2230 + if (!strcmp(old->mnt->mnt_sb->s_type->name, "proc") ||
2231 + !strcmp(old->mnt->mnt_sb->s_type->name, "sysfs"))
2232 + return 0;
2233 +
2234 + unlock_rename(oldnd->path.dentry, newnd->path.dentry);
2235
2236 /*
2237 - * See if the low-level filesystem might want
2238 - * to use its own hash..
2239 + * save a reference to the topmost parent for walking the union stack
2240 */
2241 - if (base->d_op && base->d_op->d_hash) {
2242 - err = base->d_op->d_hash(base, name);
2243 - dentry = ERR_PTR(err);
2244 - if (err < 0)
2245 - goto out;
2246 + path_get(&parent);
2247 + topmost = *old;
2248 +
2249 + if (old->dentry->d_inode && S_ISDIR(old->dentry->d_inode->i_mode)) {
2250 + err = __hash_lookup_build_union(oldnd, name, old);
2251 + if (err)
2252 + goto err_lock;
2253 + goto out_lock_and_revalidate_parent;
2254 }
2255
2256 - dentry = cached_lookup(base, name, nd);
2257 - if (!dentry) {
2258 - struct dentry *new;
2259 + err = do_union_hash_lookup(oldnd, name, old);
2260 + if (err)
2261 + goto err_lock;
2262
2263 - /* Don't create child dentry for a dead directory. */
2264 - dentry = ERR_PTR(-ENOENT);
2265 - if (IS_DEADDIR(inode))
2266 - goto out;
2267 +out_lock_and_revalidate_parent:
2268 + path_to_nameidata(&parent, oldnd);
2269 + *trap = lock_rename(oldnd->path.dentry, newnd->path.dentry);
2270
2271 - new = d_alloc(base, name);
2272 - dentry = ERR_PTR(-ENOMEM);
2273 - if (!new)
2274 - goto out;
2275 - dentry = inode->i_op->lookup(inode, new, nd);
2276 - if (!dentry)
2277 - dentry = new;
2278 - else
2279 - dput(new);
2280 - }
2281 -out:
2282 - return dentry;
2283 -}
2284 + /*
2285 + * If we return the topmost dentry we have to make sure that it has not
2286 + * been moved away while we gave up the topmost parents i_mutex lock.
2287 + */
2288 + if (topmost.dentry == old->dentry) {
2289 + spin_lock(&old->dentry->d_lock);
2290 + if (oldnd->path.dentry != old->dentry->d_parent) {
2291 + spin_unlock(&old->dentry->d_lock);
2292 + dput(old->dentry);
2293 + name->hash = full_name_hash(name->name, name->len);
2294 + err = lookup_hash(oldnd, name, old);
2295 + if (err)
2296 + return err;
2297 + /* FIXME: What if we find a directory here ... */
2298 + return err;
2299 + }
2300 + spin_unlock(&old->dentry->d_lock);
2301 + } else
2302 + dput(topmost.dentry);
2303
2304 -/*
2305 - * Restricted form of lookup. Doesn't follow links, single-component only,
2306 - * needs parent already locked. Doesn't follow mounts.
2307 - * SMP-safe.
2308 - */
2309 -static struct dentry *lookup_hash(struct nameidata *nd)
2310 -{
2311 - int err;
2312 + return 0;
2313
2314 - err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC);
2315 - if (err)
2316 - return ERR_PTR(err);
2317 - return __lookup_hash(&nd->last, nd->path.dentry, nd);
2318 +err_lock:
2319 + path_to_nameidata(&parent, oldnd);
2320 + path_put_conditional(old, oldnd);
2321 + *trap = lock_rename(oldnd->path.dentry, newnd->path.dentry);
2322 + return err;
2323 }
2324
2325 static int __lookup_one_len(const char *name, struct qstr *this,
2326 @@ -1502,8 +2335,9 @@ int vfs_create(struct inode *dir, struct
2327 return error;
2328 }
2329
2330 -int may_open(struct path *path, int acc_mode, int flag)
2331 +int may_open(struct nameidata *nd, int acc_mode, int flag)
2332 {
2333 + struct path *path = &nd->path;
2334 struct dentry *dentry = path->dentry;
2335 struct inode *inode = dentry->d_inode;
2336 int error;
2337 @@ -1529,7 +2363,7 @@ int may_open(struct path *path, int acc_
2338 break;
2339 }
2340
2341 - error = inode_permission(inode, acc_mode);
2342 + error = union_permission(path, acc_mode);
2343 if (error)
2344 return error;
2345
2346 @@ -1577,6 +2411,9 @@ int may_open(struct path *path, int acc_
2347 if (!error)
2348 error = security_path_truncate(path, 0,
2349 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
2350 + /* XXX don't copy up file data */
2351 + if (is_unionized(path->dentry, path->mnt))
2352 + error = union_copyup(nd, flag /* XXX not used */);
2353 if (!error) {
2354 vfs_dq_init(inode);
2355
2356 @@ -1623,7 +2460,7 @@ out_unlock:
2357 if (error)
2358 return error;
2359 /* Don't check for write permission, don't truncate */
2360 - return may_open(&nd->path, 0, flag & ~O_TRUNC);
2361 + return may_open(nd, 0, flag & ~O_TRUNC);
2362 }
2363
2364 /*
2365 @@ -1738,12 +2575,10 @@ struct file *do_filp_open(int dfd, const
2366 if (flag & O_EXCL)
2367 nd.flags |= LOOKUP_EXCL;
2368 mutex_lock(&dir->d_inode->i_mutex);
2369 - path.dentry = lookup_hash(&nd);
2370 - path.mnt = nd.path.mnt;
2371 + error = hash_lookup_union(&nd, &nd.last, &path);
2372
2373 do_last:
2374 - error = PTR_ERR(path.dentry);
2375 - if (IS_ERR(path.dentry)) {
2376 + if (error) {
2377 mutex_unlock(&dir->d_inode->i_mutex);
2378 goto exit;
2379 }
2380 @@ -1803,10 +2638,23 @@ do_last:
2381 if (path.dentry->d_inode->i_op->follow_link)
2382 goto do_link;
2383
2384 - path_to_nameidata(&path, &nd);
2385 error = -EISDIR;
2386 if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
2387 - goto exit;
2388 + goto exit_dput;
2389 +
2390 + /*
2391 + * If this file is on a lower layer of the union stack, copy it to the
2392 + * topmost layer before opening it
2393 + */
2394 + if (path.dentry->d_inode &&
2395 + (path.dentry->d_parent != dir) &&
2396 + S_ISREG(path.dentry->d_inode->i_mode)) {
2397 + error = __union_copyup(&path, &nd, &path);
2398 + if (error)
2399 + goto exit_dput;
2400 + }
2401 +
2402 + path_to_nameidata(&path, &nd);
2403 ok:
2404 /*
2405 * Consider:
2406 @@ -1824,12 +2672,18 @@ ok:
2407 if (error)
2408 goto exit;
2409 }
2410 - error = may_open(&nd.path, acc_mode, flag);
2411 + error = may_open(&nd, acc_mode, flag);
2412 if (error) {
2413 if (will_write)
2414 mnt_drop_write(nd.path.mnt);
2415 goto exit;
2416 }
2417 + /* Okay, all permissions go, now copy up */
2418 + if (!(flag & O_CREAT) && (flag & FMODE_WRITE)) {
2419 + error = union_copyup(&nd, flag /* XXX not used */);
2420 + if (error)
2421 + goto exit;
2422 + }
2423 filp = nameidata_to_filp(&nd, open_flag);
2424 if (IS_ERR(filp))
2425 ima_counts_put(&nd.path,
2426 @@ -1904,8 +2758,7 @@ do_link:
2427 }
2428 dir = nd.path.dentry;
2429 mutex_lock(&dir->d_inode->i_mutex);
2430 - path.dentry = lookup_hash(&nd);
2431 - path.mnt = nd.path.mnt;
2432 + error = hash_lookup_union(&nd, &nd.last, &path);
2433 __putname(nd.last.name);
2434 goto do_last;
2435 }
2436 @@ -1939,7 +2792,8 @@ EXPORT_SYMBOL(filp_open);
2437 */
2438 struct dentry *lookup_create(struct nameidata *nd, int is_dir)
2439 {
2440 - struct dentry *dentry = ERR_PTR(-EEXIST);
2441 + struct path path = { .dentry = ERR_PTR(-EEXIST) } ;
2442 + int err;
2443
2444 mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2445 /*
2446 @@ -1955,11 +2809,13 @@ struct dentry *lookup_create(struct name
2447 /*
2448 * Do the final lookup.
2449 */
2450 - dentry = lookup_hash(nd);
2451 - if (IS_ERR(dentry))
2452 + err = hash_lookup_union(nd, &nd->last, &path);
2453 + if (err) {
2454 + path.dentry = ERR_PTR(err);
2455 goto fail;
2456 + }
2457
2458 - if (dentry->d_inode)
2459 + if (path.dentry->d_inode)
2460 goto eexist;
2461 /*
2462 * Special case - lookup gave negative, but... we had foo/bar/
2463 @@ -1968,15 +2824,17 @@ struct dentry *lookup_create(struct name
2464 * been asking for (non-existent) directory. -ENOENT for you.
2465 */
2466 if (unlikely(!is_dir && nd->last.name[nd->last.len])) {
2467 - dput(dentry);
2468 - dentry = ERR_PTR(-ENOENT);
2469 + path_put_conditional(&path, nd);
2470 + path.dentry = ERR_PTR(-ENOENT);
2471 }
2472 - return dentry;
2473 + if (nd->path.mnt != path.mnt)
2474 + mntput(path.mnt);
2475 + return path.dentry;
2476 eexist:
2477 - dput(dentry);
2478 - dentry = ERR_PTR(-EEXIST);
2479 + path_put_conditional(&path, nd);
2480 + path.dentry = ERR_PTR(-EEXIST);
2481 fail:
2482 - return dentry;
2483 + return path.dentry;
2484 }
2485 EXPORT_SYMBOL_GPL(lookup_create);
2486
2487 @@ -2088,6 +2946,7 @@ SYSCALL_DEFINE3(mknod, const char __user
2488 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2489 {
2490 int error = may_create(dir, dentry);
2491 + int opaque = 0;
2492
2493 if (error)
2494 return error;
2495 @@ -2101,9 +2960,18 @@ int vfs_mkdir(struct inode *dir, struct
2496 return error;
2497
2498 vfs_dq_init(dir);
2499 +
2500 + if (d_is_whiteout(dentry))
2501 + opaque = 1;
2502 +
2503 error = dir->i_op->mkdir(dir, dentry, mode);
2504 - if (!error)
2505 + if (!error) {
2506 fsnotify_mkdir(dir, dentry);
2507 + if (opaque) {
2508 + dentry->d_inode->i_flags |= S_OPAQUE;
2509 + mark_inode_dirty(dentry->d_inode);
2510 + }
2511 + }
2512 return error;
2513 }
2514
2515 @@ -2149,6 +3017,212 @@ SYSCALL_DEFINE2(mkdir, const char __user
2516 return sys_mkdirat(AT_FDCWD, pathname, mode);
2517 }
2518
2519 +
2520 +/* Checks on the victim for whiteout */
2521 +static inline int may_whiteout(struct inode *dir, struct dentry *victim,
2522 + int isdir)
2523 +{
2524 + int err;
2525 +
2526 + /* from may_create() */
2527 + if (IS_DEADDIR(dir))
2528 + return -ENOENT;
2529 + err = inode_permission(dir, MAY_WRITE | MAY_EXEC);
2530 + if (err)
2531 + return err;
2532 +
2533 + /* from may_delete() */
2534 + if (IS_APPEND(dir))
2535 + return -EPERM;
2536 + if (!victim->d_inode)
2537 + return 0;
2538 + if (check_sticky(dir, victim->d_inode) ||
2539 + IS_APPEND(victim->d_inode) ||
2540 + IS_IMMUTABLE(victim->d_inode))
2541 + return -EPERM;
2542 + if (isdir) {
2543 + if (!S_ISDIR(victim->d_inode->i_mode))
2544 + return -ENOTDIR;
2545 + if (IS_ROOT(victim))
2546 + return -EBUSY;
2547 + } else if (S_ISDIR(victim->d_inode->i_mode))
2548 + return -EISDIR;
2549 + if (victim->d_flags & DCACHE_NFSFS_RENAMED)
2550 + return -EBUSY;
2551 + return 0;
2552 +}
2553 +
2554 +/**
2555 + * vfs_whiteout: creates a white-out for the given directory entry
2556 + * @dir: parent inode
2557 + * @dentry: directory entry to white-out
2558 + *
2559 + * Simply white-out a given directory entry. This functionality is usually used
2560 + * in the sense of unlink. Therefore the given dentry can still be in-use and
2561 + * contains an in-use inode. The filesystem has to do what unlink or rmdir
2562 + * would in that case. Since the dentry still might be in-use we have to
2563 + * provide a fresh unhashed dentry that whiteout can fill the new inode into.
2564 + * In that case the given dentry is dropped and the fresh dentry containing the
2565 + * whiteout is rehashed instead. If the given dentry is unused, the whiteout
2566 + * inode is instantiated into it instead.
2567 + *
2568 + * After this returns with success, don't make any assumptions about the inode.
2569 + * Just dput() the dentry.
2570 + */
2571 +static int vfs_whiteout(struct inode *dir, struct dentry *dentry, int isdir)
2572 +{
2573 + int err;
2574 + struct inode *old_inode = dentry->d_inode;
2575 + struct dentry *parent, *whiteout;
2576 +
2577 + err = may_whiteout(dir, dentry, isdir);
2578 + if (err)
2579 + return err;
2580 +
2581 + BUG_ON(dentry->d_parent->d_inode != dir);
2582 +
2583 + if (!dir->i_op || !dir->i_op->whiteout)
2584 + return -EOPNOTSUPP;
2585 +
2586 + if (old_inode) {
2587 + vfs_dq_init(dir);
2588 +
2589 + mutex_lock(&old_inode->i_mutex);
2590 + if (isdir)
2591 + dentry_unhash(dentry);
2592 + if (d_mountpoint(dentry))
2593 + err = -EBUSY;
2594 + else {
2595 + if (isdir)
2596 + err = security_inode_rmdir(dir, dentry);
2597 + else
2598 + err = security_inode_unlink(dir, dentry);
2599 + }
2600 + }
2601 +
2602 + parent = dget_parent(dentry);
2603 + whiteout = d_alloc_name(parent, dentry->d_name.name);
2604 + if (!whiteout && !err)
2605 + err = -ENOMEM; /* never hand a NULL dentry to ->whiteout() */
2606 + if (!err)
2607 + err = dir->i_op->whiteout(dir, dentry, whiteout);
2608 + if (old_inode) {
2609 + mutex_unlock(&old_inode->i_mutex);
2610 + if (!err) {
2611 + fsnotify_link_count(old_inode);
2612 + d_delete(dentry);
2613 + }
2614 + if (isdir)
2615 + dput(dentry);
2616 + }
2617 +
2618 + dput(whiteout);
2619 + dput(parent);
2620 + return err;
2621 +}
2622 +
2623 +int path_whiteout(struct path *dir_path, struct dentry *dentry, int isdir)
2624 +{
2625 + int error = mnt_want_write(dir_path->mnt);
2626 +
2627 + if (!error) {
2628 + error = vfs_whiteout(dir_path->dentry->d_inode, dentry, isdir);
2629 + mnt_drop_write(dir_path->mnt);
2630 + }
2631 +
2632 + return error;
2633 +}
2634 +EXPORT_SYMBOL(path_whiteout);
2635 +
2636 +/*
2637 + * This is abusing readdir to check if a union directory is logically empty.
2638 + * Al Viro barfed when he saw this, but Val said: "Well, at this point I'm
2639 + * aiming for working, pretty can come later"
2640 + */
2641 +static int filldir_is_empty(void *__buf, const char *name, int namlen,
2642 + loff_t offset, u64 ino, unsigned int d_type)
2643 +{
2644 + int *is_empty = (int *)__buf;
2645 +
2646 + switch (namlen) {
2647 + case 2:
2648 + if (name[1] != '.')
2649 + break;
2650 + case 1:
2651 + if (name[0] != '.')
2652 + break;
2653 + return 0;
2654 + }
2655 +
2656 + if (d_type == DT_WHT)
2657 + return 0;
2658 +
2659 + (*is_empty) = 0;
2660 + return 0;
2661 +}
2662 +
2663 +static int directory_is_empty(struct dentry *dentry, struct vfsmount *mnt)
2664 +{
2665 + struct file *file;
2666 + int err;
2667 + int is_empty = 1;
2668 +
2669 + BUG_ON(!S_ISDIR(dentry->d_inode->i_mode));
2670 +
2671 + /* references for the file pointer */
2672 + dget(dentry);
2673 + mntget(mnt);
2674 +
2675 + file = dentry_open(dentry, mnt, O_RDONLY, current_cred());
2676 + if (IS_ERR(file))
2677 + return 0;
2678 +
2679 + err = vfs_readdir(file, filldir_is_empty, &is_empty);
2680 + /* a failed scan proves nothing: report "not empty", as for a failed open */
2681 + fput(file);
2682 + return err < 0 ? 0 : is_empty;
2683 +}
2684 +
2685 +static int do_whiteout(struct nameidata *nd, struct path *path, int isdir)
2686 +{
2687 + struct path safe = { .dentry = dget(nd->path.dentry),
2688 + .mnt = mntget(nd->path.mnt) };
2689 + struct dentry *dentry = path->dentry;
2690 + int err;
2691 +
2692 + err = may_whiteout(nd->path.dentry->d_inode, dentry, isdir);
2693 + if (err)
2694 + goto out;
2695 +
2696 + err = -ENOENT;
2697 + if (!dentry->d_inode)
2698 + goto out;
2699 +
2700 + err = -ENOTEMPTY;
2701 + if (isdir && !directory_is_empty(path->dentry, path->mnt))
2702 + goto out;
2703 +
2704 + if (nd->path.dentry != dentry->d_parent) {
2705 + dentry = __lookup_hash(&path->dentry->d_name, nd->path.dentry,
2706 + nd);
2707 + err = PTR_ERR(dentry);
2708 + if (IS_ERR(dentry))
2709 + goto out;
2710 +
2711 + dput(path->dentry);
2712 + if (path->mnt != safe.mnt)
2713 + mntput(path->mnt);
2714 + path->mnt = nd->path.mnt;
2715 + path->dentry = dentry;
2716 + }
2717 +
2718 + err = vfs_whiteout(nd->path.dentry->d_inode, dentry, isdir);
2719 +
2720 +out:
2721 + path_put(&safe);
2722 + return err;
2723 +}
2724 +
2725 /*
2726 * We try to drop the dentry early: we should have
2727 * a usage count of 2 if we're the only user of this
2728 @@ -2213,7 +3287,7 @@ static long do_rmdir(int dfd, const char
2729 {
2730 int error = 0;
2731 char * name;
2732 - struct dentry *dentry;
2733 + struct path path;
2734 struct nameidata nd;
2735
2736 error = user_path_parent(dfd, pathname, &nd, &name);
2737 @@ -2235,21 +3309,24 @@ static long do_rmdir(int dfd, const char
2738 nd.flags &= ~LOOKUP_PARENT;
2739
2740 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2741 - dentry = lookup_hash(&nd);
2742 - error = PTR_ERR(dentry);
2743 - if (IS_ERR(dentry))
2744 + error = hash_lookup_union(&nd, &nd.last, &path);
2745 + if (error)
2746 goto exit2;
2747 + if (is_unionized(nd.path.dentry, nd.path.mnt)) {
2748 + error = do_whiteout(&nd, &path, 1);
2749 + goto exit3;
2750 + }
2751 error = mnt_want_write(nd.path.mnt);
2752 if (error)
2753 goto exit3;
2754 - error = security_path_rmdir(&nd.path, dentry);
2755 + error = security_path_rmdir(&nd.path, path.dentry);
2756 if (error)
2757 goto exit4;
2758 - error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
2759 + error = vfs_rmdir(nd.path.dentry->d_inode, path.dentry);
2760 exit4:
2761 mnt_drop_write(nd.path.mnt);
2762 exit3:
2763 - dput(dentry);
2764 + path_put_conditional(&path, &nd);
2765 exit2:
2766 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2767 exit1:
2768 @@ -2304,7 +3381,7 @@ static long do_unlinkat(int dfd, const c
2769 {
2770 int error;
2771 char *name;
2772 - struct dentry *dentry;
2773 + struct path path;
2774 struct nameidata nd;
2775 struct inode *inode = NULL;
2776
2777 @@ -2319,26 +3396,29 @@ static long do_unlinkat(int dfd, const c
2778 nd.flags &= ~LOOKUP_PARENT;
2779
2780 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2781 - dentry = lookup_hash(&nd);
2782 - error = PTR_ERR(dentry);
2783 - if (!IS_ERR(dentry)) {
2784 + error = hash_lookup_union(&nd, &nd.last, &path);
2785 + if (!error) {
2786 /* Why not before? Because we want correct error value */
2787 if (nd.last.name[nd.last.len])
2788 goto slashes;
2789 - inode = dentry->d_inode;
2790 + inode = path.dentry->d_inode;
2791 if (inode)
2792 atomic_inc(&inode->i_count);
2793 + if (is_unionized(nd.path.dentry, nd.path.mnt)) {
2794 + error = do_whiteout(&nd, &path, 0);
2795 + goto exit2;
2796 + }
2797 error = mnt_want_write(nd.path.mnt);
2798 if (error)
2799 goto exit2;
2800 - error = security_path_unlink(&nd.path, dentry);
2801 + error = security_path_unlink(&nd.path, path.dentry);
2802 if (error)
2803 goto exit3;
2804 - error = vfs_unlink(nd.path.dentry->d_inode, dentry);
2805 + error = vfs_unlink(nd.path.dentry->d_inode, path.dentry);
2806 exit3:
2807 mnt_drop_write(nd.path.mnt);
2808 exit2:
2809 - dput(dentry);
2810 + path_put_conditional(&path, &nd);
2811 }
2812 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2813 if (inode)
2814 @@ -2349,8 +3429,8 @@ exit1:
2815 return error;
2816
2817 slashes:
2818 - error = !dentry->d_inode ? -ENOENT :
2819 - S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
2820 + error = !path.dentry->d_inode ? -ENOENT :
2821 + S_ISDIR(path.dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
2822 goto exit2;
2823 }
2824
2825 @@ -2686,11 +3766,96 @@ int vfs_rename(struct inode *old_dir, st
2826 return error;
2827 }
2828
2829 +static int vfs_rename_union(struct nameidata *oldnd, struct path *old,
2830 + struct nameidata *newnd, struct path *new)
2831 +{
2832 + struct inode *old_dir = oldnd->path.dentry->d_inode;
2833 + struct inode *new_dir = newnd->path.dentry->d_inode;
2834 + struct qstr old_name;
2835 + char *name;
2836 + struct dentry *dentry;
2837 + int error;
2838 +
2839 + if (old->dentry->d_inode == new->dentry->d_inode)
2840 + return 0;
2841 + error = may_whiteout(old_dir, old->dentry, 0);
2842 + if (error)
2843 + return error;
2844 + if (!old_dir->i_op || !old_dir->i_op->whiteout)
2845 + return -EPERM;
2846 +
2847 + if (!new->dentry->d_inode)
2848 + error = may_create(new_dir, new->dentry);
2849 + else
2850 + error = may_delete(new_dir, new->dentry, 0);
2851 + if (error)
2852 + return error;
2853 +
2854 + vfs_dq_init(old_dir);
2855 + vfs_dq_init(new_dir);
2856 +
2857 + error = -EBUSY;
2858 + if (d_mountpoint(old->dentry) || d_mountpoint(new->dentry))
2859 + return error;
2860 + /* private NUL-terminated copy of the old name, hence the extra byte */
2861 + error = -ENOMEM;
2862 + name = kmalloc(old->dentry->d_name.len + 1, GFP_KERNEL);
2863 + if (!name)
2864 + return error;
2865 + strncpy(name, old->dentry->d_name.name, old->dentry->d_name.len);
2866 + name[old->dentry->d_name.len] = 0;
2867 + old_name.len = old->dentry->d_name.len;
2868 + old_name.hash = old->dentry->d_name.hash;
2869 + old_name.name = name;
2870 +
2871 + /* possibly delete the existing new file */
2872 + if ((newnd->path.dentry == new->dentry->d_parent) &&
2873 + new->dentry->d_inode) {
2874 + /* FIXME: inode may be truncated while we hold a lock */
2875 + error = vfs_unlink(new_dir, new->dentry);
2876 + if (error)
2877 + goto freename;
2878 +
2879 + dentry = __lookup_hash(&new->dentry->d_name,
2880 + newnd->path.dentry, newnd);
2881 + error = PTR_ERR(dentry);
2882 + if (IS_ERR(dentry))
2883 + goto freename;
2884 + dput(new->dentry);
2885 + new->dentry = dentry;
2886 + }
2887 +
2888 + /* copyup to the new file */
2889 + error = __union_copyup(old, newnd, new);
2890 + if (error)
2891 + goto freename;
2892 +
2893 + /* whiteout the old file */
2894 + dentry = __lookup_hash(&old_name, oldnd->path.dentry, oldnd);
2895 + error = PTR_ERR(dentry);
2896 + if (IS_ERR(dentry))
2897 + goto freename;
2898 + error = vfs_whiteout(old_dir, dentry, 0);
2899 + dput(dentry);
2900 +
2901 + /* FIXME: This is actually unlink() && create() ... */
2902 +/*
2903 + if (!error) {
2904 + const char *new_name = old_dentry->d_name.name;
2905 + fsnotify_move(old_dir, new_dir, old_name.name, new_name, 0,
2906 + new_dentry->d_inode, old_dentry->d_inode);
2907 + }
2908 +*/
2909 +freename:
2910 + kfree(old_name.name);
2911 + return error;
2912 +}
2913 +
2914 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
2915 int, newdfd, const char __user *, newname)
2916 {
2917 struct dentry *old_dir, *new_dir;
2918 - struct dentry *old_dentry, *new_dentry;
2919 + struct path old, new;
2920 struct dentry *trap;
2921 struct nameidata oldnd, newnd;
2922 char *from;
2923 @@ -2724,16 +3889,28 @@ SYSCALL_DEFINE4(renameat, int, olddfd, c
2924
2925 trap = lock_rename(new_dir, old_dir);
2926
2927 - old_dentry = lookup_hash(&oldnd);
2928 - error = PTR_ERR(old_dentry);
2929 - if (IS_ERR(old_dentry))
2930 + /*
2931 + * For union mounts we need to call a giant lookup_rename_source()
2932 + * instead.
2933 + * First lock_rename() and look on the topmost fs like you would do in
2934 + * the normal rename, if you find something which is not a directory,
2935 + * go ahead and lookup target and do normal rename.
2936 + * If you find a negative dentry, unlock_rename() and continue as
2937 + * _hash_lookup_union() would do without locking the topmost parent
2938 + * at the end. After that do lock_rename() of the source parent and the
2939 + * target parent and do a copyup with additional whiteout creation at
2940 + * the end.
2941 + */
2942 +// error = hash_lookup_union(&oldnd, &oldnd.last, &old);
2943 + error = lookup_rename_source(&oldnd, &newnd, &trap, &oldnd.last, &old);
2944 + if (error)
2945 goto exit3;
2946 /* source must exist */
2947 error = -ENOENT;
2948 - if (!old_dentry->d_inode)
2949 + if (!old.dentry->d_inode)
2950 goto exit4;
2951 /* unless the source is a directory trailing slashes give -ENOTDIR */
2952 - if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
2953 + if (!S_ISDIR(old.dentry->d_inode->i_mode)) {
2954 error = -ENOTDIR;
2955 if (oldnd.last.name[oldnd.last.len])
2956 goto exit4;
2957 @@ -2742,32 +3919,44 @@ SYSCALL_DEFINE4(renameat, int, olddfd, c
2958 }
2959 /* source should not be ancestor of target */
2960 error = -EINVAL;
2961 - if (old_dentry == trap)
2962 + if (old.dentry == trap)
2963 goto exit4;
2964 - new_dentry = lookup_hash(&newnd);
2965 - error = PTR_ERR(new_dentry);
2966 - if (IS_ERR(new_dentry))
2967 + /* target is always on topmost fs, even with unions */
2968 + error = lookup_hash(&newnd, &newnd.last, &new);
2969 + if (error)
2970 goto exit4;
2971 /* target should not be an ancestor of source */
2972 error = -ENOTEMPTY;
2973 - if (new_dentry == trap)
2974 + if (new.dentry == trap)
2975 + goto exit5;
2976 + /* renaming of directories on unions is done by the user-space */
2977 + error = -EXDEV;
2978 + if (is_unionized(oldnd.path.dentry, oldnd.path.mnt) &&
2979 + S_ISDIR(old.dentry->d_inode->i_mode))
2980 goto exit5;
2981 +// if (is_unionized(newnd.path.dentry, newnd.path.mnt))
2982 +// goto exit5;
2983
2984 error = mnt_want_write(oldnd.path.mnt);
2985 if (error)
2986 goto exit5;
2987 - error = security_path_rename(&oldnd.path, old_dentry,
2988 - &newnd.path, new_dentry);
2989 + error = security_path_rename(&oldnd.path, old.dentry,
2990 + &newnd.path, new.dentry);
2991 if (error)
2992 goto exit6;
2993 - error = vfs_rename(old_dir->d_inode, old_dentry,
2994 - new_dir->d_inode, new_dentry);
2995 + if (is_unionized(oldnd.path.dentry, oldnd.path.mnt) &&
2996 + (old.dentry->d_parent != oldnd.path.dentry)) {
2997 + error = vfs_rename_union(&oldnd, &old, &newnd, &new);
2998 + goto exit6;
2999 + }
3000 + error = vfs_rename(old_dir->d_inode, old.dentry,
3001 + new_dir->d_inode, new.dentry);
3002 exit6:
3003 mnt_drop_write(oldnd.path.mnt);
3004 exit5:
3005 - dput(new_dentry);
3006 + path_put_conditional(&new, &newnd);
3007 exit4:
3008 - dput(old_dentry);
3009 + path_put_conditional(&old, &oldnd);
3010 exit3:
3011 unlock_rename(new_dir, old_dir);
3012 exit2:
3013 --- a/fs/namespace.c
3014 +++ b/fs/namespace.c
3015 @@ -29,6 +29,7 @@
3016 #include <linux/log2.h>
3017 #include <linux/idr.h>
3018 #include <linux/fs_struct.h>
3019 +#include <linux/union.h>
3020 #include <asm/uaccess.h>
3021 #include <asm/unistd.h>
3022 #include "pnode.h"
3023 @@ -150,6 +151,9 @@ struct vfsmount *alloc_vfsmnt(const char
3024 INIT_LIST_HEAD(&mnt->mnt_share);
3025 INIT_LIST_HEAD(&mnt->mnt_slave_list);
3026 INIT_LIST_HEAD(&mnt->mnt_slave);
3027 +#ifdef CONFIG_UNION_MOUNT
3028 + INIT_LIST_HEAD(&mnt->mnt_unions);
3029 +#endif
3030 #ifdef CONFIG_SMP
3031 mnt->mnt_writers = alloc_percpu(int);
3032 if (!mnt->mnt_writers)
3033 @@ -469,6 +473,7 @@ static void __touch_mnt_namespace(struct
3034
3035 static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
3036 {
3037 + detach_mnt_union(mnt);
3038 old_path->dentry = mnt->mnt_mountpoint;
3039 old_path->mnt = mnt->mnt_parent;
3040 mnt->mnt_parent = mnt;
3041 @@ -492,6 +497,7 @@ static void attach_mnt(struct vfsmount *
3042 list_add_tail(&mnt->mnt_hash, mount_hashtable +
3043 hash(path->mnt, path->dentry));
3044 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
3045 + attach_mnt_union(mnt, path->mnt, path->dentry);
3046 }
3047
3048 /*
3049 @@ -514,6 +520,7 @@ static void commit_tree(struct vfsmount
3050 list_add_tail(&mnt->mnt_hash, mount_hashtable +
3051 hash(parent, mnt->mnt_mountpoint));
3052 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
3053 + attach_mnt_union(mnt, mnt->mnt_parent, mnt->mnt_mountpoint);
3054 touch_mnt_namespace(n);
3055 }
3056
3057 @@ -770,6 +777,7 @@ static void show_mnt_opts(struct seq_fil
3058 { MNT_NODIRATIME, ",nodiratime" },
3059 { MNT_RELATIME, ",relatime" },
3060 { MNT_STRICTATIME, ",strictatime" },
3061 + { MNT_UNION, ",union" },
3062 { 0, NULL }
3063 };
3064 const struct proc_fs_info *fs_infop;
3065 @@ -984,6 +992,7 @@ void release_mounts(struct list_head *he
3066 struct dentry *dentry;
3067 struct vfsmount *m;
3068 spin_lock(&vfsmount_lock);
3069 + detach_mnt_union(mnt);
3070 dentry = mnt->mnt_mountpoint;
3071 m = mnt->mnt_parent;
3072 mnt->mnt_mountpoint = mnt->mnt_root;
3073 @@ -1102,6 +1111,11 @@ static int do_umount(struct vfsmount *mn
3074 spin_unlock(&vfsmount_lock);
3075 if (retval)
3076 security_sb_umount_busy(mnt);
3077 + /* If this was a union mount, we are no longer a read-only
3078 + * user on the underlying mount */
3079 + if (mnt->mnt_flags & MNT_UNION)
3080 + mnt->mnt_parent->mnt_sb->s_readonly_users--;
3081 +
3082 up_write(&namespace_sem);
3083 release_mounts(&umount_list);
3084 return retval;
3085 @@ -1426,6 +1440,10 @@ static int do_change_type(struct path *p
3086 if (path->dentry != path->mnt->mnt_root)
3087 return -EINVAL;
3088
3089 + /* Don't change the type of union mounts */
3090 + if (IS_MNT_UNION(path->mnt))
3091 + return -EINVAL;
3092 +
3093 down_write(&namespace_sem);
3094 if (type == MS_SHARED) {
3095 err = invent_group_ids(mnt, recurse);
3096 @@ -1444,10 +1462,65 @@ static int do_change_type(struct path *p
3097 }
3098
3099 /*
3100 + * Mount-time check of upper and lower layer file systems to see if we
3101 + * can union mount one on the other.
3102 + *
3103 + * Union mounts must follow these rules:
3104 + *
3105 + * - The lower layer must be read-only. This avoids lots of nasty
3106 + * unsolvable races where file system structures disappear suddenly.
3107 + * XXX - Checking the vfsmnt for read-only is a temporary hack; the
3108 + * file system could be mounted read-write elsewhere. We need to
3109 + * enforce read-only at the superblock level (patches coming).
3110 + *
3111 + * - The upper layer must be writable. This isn't an absolute
3112 + * requirement; right now we need it to make readdir() work since we
3113 + * copy up directory entries to the top level. A possible
3114 + * workaround is to mount a tmpfs file system transparently over the
3115 + * top.
3116 + *
3117 + * - The upper layer must support whiteouts and fallthrus (if it is
3118 + * writable).
3119 + *
3120 + * - The lower layer must not also be a union mount. This is just to
3121 + * make life simpler for now, there is no inherent limitation on the
3122 + * number of layers.
3123 + *
3124 + * XXX - Check other mount flags for incompatibilities - I'm sure
3125 + * there are some.
3126 + */
3127 +
3128 +static int
3129 +check_union_mnt(struct path *mntpnt, struct vfsmount *top_mnt, int mnt_flags)
3130 +{
3131 + struct vfsmount *lower_mnt = mntpnt->mnt;
3132 +
3133 + /* Is this even a union mount? */
3134 + if (!(mnt_flags & MNT_UNION))
3135 + return 0;
3136 +
3137 + /* Lower layer must be read-only and not a union mount */
3138 + if (!(lower_mnt->mnt_sb->s_flags & MS_RDONLY) ||
3139 + (lower_mnt->mnt_flags & MNT_UNION))
3140 + return -EBUSY;
3141 +
3142 + /* Upper layer must be writable */
3143 + if (mnt_flags & MNT_READONLY)
3144 + return -EROFS;
3145 +
3146 + /* Upper layer must support whiteouts and fallthrus */
3147 + if (!(top_mnt->mnt_sb->s_flags & MS_WHITEOUT))
3148 + return -EINVAL;
3149 +
3150 + /* All good! */
3151 + return 0;
3152 +}
3153 +
3154 +/*
3155 * do loopback mount.
3156 */
3157 -static int do_loopback(struct path *path, char *old_name,
3158 - int recurse)
3159 +static int do_loopback(struct path *path, char *old_name, int recurse,
3160 + int mnt_flags)
3161 {
3162 struct path old_path;
3163 struct vfsmount *mnt = NULL;
3164 @@ -1477,6 +1550,13 @@ static int do_loopback(struct path *path
3165 if (!mnt)
3166 goto out;
3167
3168 + err = check_union_mnt(&old_path, mnt, mnt_flags);
3169 + if (err)
3170 + goto out;
3171 +
3172 + if (mnt_flags & MNT_UNION)
3173 + mnt->mnt_flags |= MNT_UNION;
3174 +
3175 err = graft_tree(mnt, path);
3176 if (err) {
3177 LIST_HEAD(umount_list);
3178 @@ -1486,6 +1566,10 @@ static int do_loopback(struct path *path
3179 release_mounts(&umount_list);
3180 }
3181
3182 + /* If this is a union mount, add ourselves to the readonly users */
3183 + if (mnt_flags & MNT_UNION)
3184 + mnt->mnt_parent->mnt_sb->s_readonly_users++;
3185 +
3186 out:
3187 up_write(&namespace_sem);
3188 path_put(&old_path);
3189 @@ -1570,6 +1654,13 @@ static int do_move_mount(struct path *pa
3190 if (err)
3191 return err;
3192
3193 + /* moving to or from a union mount is not supported */
3194 + err = -EINVAL;
3195 + if (IS_MNT_UNION(path->mnt))
3196 + goto exit;
3197 + if (IS_MNT_UNION(old_path.mnt))
3198 + goto exit;
3199 +
3200 down_write(&namespace_sem);
3201 while (d_mountpoint(path->dentry) &&
3202 follow_down(path))
3203 @@ -1627,6 +1718,7 @@ out:
3204 up_write(&namespace_sem);
3205 if (!err)
3206 path_put(&parent_path);
3207 +exit:
3208 path_put(&old_path);
3209 return err;
3210 }
3211 @@ -1684,10 +1776,18 @@ int do_add_mount(struct vfsmount *newmnt
3212 if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
3213 goto unlock;
3214
3215 + err = check_union_mnt(path, newmnt, mnt_flags);
3216 + if (err)
3217 + goto unlock;
3218 +
3219 newmnt->mnt_flags = mnt_flags;
3220 if ((err = graft_tree(newmnt, path)))
3221 goto unlock;
3222
3223 + /* If this is a union mount, add ourselves to the readonly users */
3224 + if (mnt_flags & MNT_UNION)
3225 + newmnt->mnt_parent->mnt_sb->s_readonly_users++;
3226 +
3227 if (fslist) /* add to the specified expiration list */
3228 list_add_tail(&newmnt->mnt_expire, fslist);
3229
3230 @@ -1940,10 +2040,12 @@ long do_mount(char *dev_name, char *dir_
3231 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
3232 if (flags & MS_RDONLY)
3233 mnt_flags |= MNT_READONLY;
3234 + if (flags & MS_UNION)
3235 + mnt_flags |= MNT_UNION;
3236
3237 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
3238 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
3239 - MS_STRICTATIME);
3240 + MS_STRICTATIME | MS_UNION);
3241
3242 /* ... and get the mountpoint */
3243 retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
3244 @@ -1959,7 +2061,8 @@ long do_mount(char *dev_name, char *dir_
3245 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
3246 data_page);
3247 else if (flags & MS_BIND)
3248 - retval = do_loopback(&path, dev_name, flags & MS_REC);
3249 + retval = do_loopback(&path, dev_name, flags & MS_REC,
3250 + mnt_flags);
3251 else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
3252 retval = do_change_type(&path, flags);
3253 else if (flags & MS_MOVE)
3254 @@ -2196,6 +2299,8 @@ SYSCALL_DEFINE2(pivot_root, const char _
3255 if (d_unlinked(old.dentry))
3256 goto out2;
3257 error = -EBUSY;
3258 + follow_union_down(&new);
3259 + follow_union_down(&root);
3260 if (new.mnt == root.mnt ||
3261 old.mnt == root.mnt)
3262 goto out2; /* loop, on the same file system */
3263 --- a/fs/nfsctl.c
3264 +++ b/fs/nfsctl.c
3265 @@ -38,10 +38,10 @@ static struct file *do_open(char *name,
3266 return ERR_PTR(error);
3267
3268 if (flags == O_RDWR)
3269 - error = may_open(&nd.path, MAY_READ|MAY_WRITE,
3270 - FMODE_READ|FMODE_WRITE);
3271 + error = may_open(&nd, MAY_READ|MAY_WRITE,
3272 + FMODE_READ|FMODE_WRITE);
3273 else
3274 - error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE);
3275 + error = may_open(&nd, MAY_WRITE, FMODE_WRITE);
3276
3277 if (!error)
3278 return dentry_open(nd.path.dentry, nd.path.mnt, flags,
3279 --- a/fs/nfsd/nfs3xdr.c
3280 +++ b/fs/nfsd/nfs3xdr.c
3281 @@ -898,6 +898,11 @@ encode_entry(struct readdir_cd *ccd, con
3282 int elen; /* estimated entry length in words */
3283 int num_entry_words = 0; /* actual number of words */
3284
3285 + if (d_type == DT_WHT) {
3286 + cd->common.err = nfs_ok;
3287 + return 0;
3288 + }
3289 +
3290 if (cd->offset) {
3291 u64 offset64 = offset;
3292
3293 --- a/fs/nfsd/nfs4xdr.c
3294 +++ b/fs/nfsd/nfs4xdr.c
3295 @@ -2261,7 +2261,7 @@ nfsd4_encode_dirent(void *ccdv, const ch
3296 __be32 nfserr = nfserr_toosmall;
3297
3298 /* In nfsv4, "." and ".." never make it onto the wire.. */
3299 - if (name && isdotent(name, namlen)) {
3300 + if (d_type == DT_WHT || (name && isdotent(name, namlen))) {
3301 cd->common.err = nfs_ok;
3302 return 0;
3303 }
3304 --- a/fs/nfsd/nfsxdr.c
3305 +++ b/fs/nfsd/nfsxdr.c
3306 @@ -513,6 +513,10 @@ nfssvc_encode_entry(void *ccdv, const ch
3307 namlen, name, offset, ino);
3308 */
3309
3310 + if (d_type == DT_WHT) {
3311 + cd->common.err = nfs_ok;
3312 + return 0;
3313 + }
3314 if (offset > ~((u32) 0)) {
3315 cd->common.err = nfserr_fbig;
3316 return -EINVAL;
3317 --- a/fs/open.c
3318 +++ b/fs/open.c
3319 @@ -30,6 +30,7 @@
3320 #include <linux/audit.h>
3321 #include <linux/falloc.h>
3322 #include <linux/fs_struct.h>
3323 +#include <linux/union.h>
3324
3325 int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
3326 {
3327 @@ -224,69 +225,69 @@ int do_truncate(struct dentry *dentry, l
3328 return ret;
3329 }
3330
3331 -static long do_sys_truncate(const char __user *pathname, loff_t length)
3332 +static int __do_ftruncate(struct file *file, loff_t length, int small)
3333 {
3334 - struct path path;
3335 - struct inode *inode;
3336 + struct inode * inode;
3337 + struct dentry *dentry;
3338 int error;
3339
3340 error = -EINVAL;
3341 - if (length < 0) /* sorry, but loff_t says... */
3342 + if (length < 0) /* length must stay a signed loff_t for this to fire */
3343 goto out;
3344 + /* explicitly opened as large or we are on 64-bit box */
3345 + if (file->f_flags & O_LARGEFILE)
3346 + small = 0;
3347
3348 - error = user_path(pathname, &path);
3349 - if (error)
3350 + dentry = file->f_path.dentry;
3351 + inode = dentry->d_inode;
3352 + error = -EINVAL;
3353 + if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
3354 goto out;
3355 - inode = path.dentry->d_inode;
3356 -
3357 - /* For directories it's -EISDIR, for other non-regulars - -EINVAL */
3358 - error = -EISDIR;
3359 - if (S_ISDIR(inode->i_mode))
3360 - goto dput_and_out;
3361
3362 error = -EINVAL;
3363 - if (!S_ISREG(inode->i_mode))
3364 - goto dput_and_out;
3365 -
3366 - error = mnt_want_write(path.mnt);
3367 - if (error)
3368 - goto dput_and_out;
3369 + /* Cannot ftruncate over 2^31 bytes without large file support */
3370 + if (small && length > MAX_NON_LFS)
3371
3372 - error = inode_permission(inode, MAY_WRITE);
3373 - if (error)
3374 - goto mnt_drop_write_and_out;
3375 + goto out;
3376
3377 error = -EPERM;
3378 if (IS_APPEND(inode))
3379 - goto mnt_drop_write_and_out;
3380 + goto out;
3381
3382 - error = get_write_access(inode);
3383 - if (error)
3384 - goto mnt_drop_write_and_out;
3385 + error = locks_verify_truncate(inode, file, length);
3386 + if (!error)
3387 + error = security_path_truncate(&file->f_path, length,
3388 + ATTR_MTIME|ATTR_CTIME);
3389 + if (!error)
3390 + /* Already copied up for union, opened with write */
3391 + error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
3392 +out:
3393 + return error;
3394 +}
3395
3396 - /*
3397 - * Make sure that there are no leases. get_write_access() protects
3398 - * against the truncate racing with a lease-granting setlease().
3399 - */
3400 - error = break_lease(inode, FMODE_WRITE);
3401 - if (error)
3402 - goto put_write_and_out;
3403 +static long do_sys_truncate(const char __user *pathname, loff_t length)
3404 +{
3405 + struct file *file;
3406 + char *tmp;
3407 + int error;
3408
3409 - error = locks_verify_truncate(inode, NULL, length);
3410 - if (!error)
3411 - error = security_path_truncate(&path, length, 0);
3412 - if (!error) {
3413 - vfs_dq_init(inode);
3414 - error = do_truncate(path.dentry, length, 0, NULL);
3415 - }
3416 + error = -EINVAL;
3417 + if (length < 0) /* sorry, but loff_t says... */
3418 + return error;
3419
3420 -put_write_and_out:
3421 - put_write_access(inode);
3422 -mnt_drop_write_and_out:
3423 - mnt_drop_write(path.mnt);
3424 -dput_and_out:
3425 - path_put(&path);
3426 -out:
3427 + tmp = getname(pathname);
3428 + if (IS_ERR(tmp))
3429 + return PTR_ERR(tmp);
3430 +
3431 + file = filp_open(tmp, O_RDWR | O_LARGEFILE, 0);
3432 + putname(tmp);
3433 +
3434 + if (IS_ERR(file))
3435 + return PTR_ERR(file);
3436 +
3437 + error = __do_ftruncate(file, length, 0);
3438 +
3439 + fput(file);
3440 return error;
3441 }
3442
3443 @@ -297,45 +298,16 @@ SYSCALL_DEFINE2(truncate, const char __u
3444
3445 static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
3446 {
3447 - struct inode * inode;
3448 - struct dentry *dentry;
3449 struct file * file;
3450 int error;
3451
3452 - error = -EINVAL;
3453 - if (length < 0)
3454 - goto out;
3455 error = -EBADF;
3456 file = fget(fd);
3457 if (!file)
3458 goto out;
3459
3460 - /* explicitly opened as large or we are on 64-bit box */
3461 - if (file->f_flags & O_LARGEFILE)
3462 - small = 0;
3463 -
3464 - dentry = file->f_path.dentry;
3465 - inode = dentry->d_inode;
3466 - error = -EINVAL;
3467 - if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
3468 - goto out_putf;
3469 -
3470 - error = -EINVAL;
3471 - /* Cannot ftruncate over 2^31 bytes without large file support */
3472 - if (small && length > MAX_NON_LFS)
3473 - goto out_putf;
3474 + error = __do_ftruncate(file, length, small);
3475
3476 - error = -EPERM;
3477 - if (IS_APPEND(inode))
3478 - goto out_putf;
3479 -
3480 - error = locks_verify_truncate(inode, file, length);
3481 - if (!error)
3482 - error = security_path_truncate(&file->f_path, length,
3483 - ATTR_MTIME|ATTR_CTIME);
3484 - if (!error)
3485 - error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
3486 -out_putf:
3487 fput(file);
3488 out:
3489 return error;
3490 @@ -494,7 +466,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, con
3491 goto out_path_release;
3492 }
3493
3494 - res = inode_permission(inode, mode | MAY_ACCESS);
3495 + res = union_permission(&path, mode | MAY_ACCESS);
3496 +
3497 /* SuS v2 requires we report a read only fs too */
3498 if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
3499 goto out_path_release;
3500 @@ -508,7 +481,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, con
3501 * inherently racy and know that the fs may change
3502 * state before we even see this result.
3503 */
3504 - if (__mnt_is_readonly(path.mnt))
3505 + if ((!is_unionized(path.dentry, path.mnt) &&
3506 + (__mnt_is_readonly(path.mnt))))
3507 res = -EROFS;
3508
3509 out_path_release:
3510 @@ -554,20 +528,19 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd
3511 error = -EBADF;
3512 file = fget(fd);
3513 if (!file)
3514 - goto out;
3515 + return error;
3516
3517 inode = file->f_path.dentry->d_inode;
3518
3519 error = -ENOTDIR;
3520 if (!S_ISDIR(inode->i_mode))
3521 - goto out_putf;
3522 + goto out;
3523
3524 error = inode_permission(inode, MAY_EXEC | MAY_ACCESS);
3525 if (!error)
3526 set_fs_pwd(current->fs, &file->f_path);
3527 -out_putf:
3528 - fput(file);
3529 out:
3530 + fput(file);
3531 return error;
3532 }
3533
3534 --- a/fs/readdir.c
3535 +++ b/fs/readdir.c
3536 @@ -16,6 +16,7 @@
3537 #include <linux/security.h>
3538 #include <linux/syscalls.h>
3539 #include <linux/unistd.h>
3540 +#include <linux/union.h>
3541
3542 #include <asm/uaccess.h>
3543
3544 @@ -36,9 +37,24 @@ int vfs_readdir(struct file *file, filld
3545
3546 res = -ENOENT;
3547 if (!IS_DEADDIR(inode)) {
3548 + /*
3549 + * XXX Think harder about locking for
3550 + * union_copyup_dir. Currently we lock the topmost
3551 + * directory and hold that lock while sequentially
3552 + * acquiring and dropping locks for the directories
3553 + * below this one in the union stack.
3554 + */
3555 + if (is_unionized(file->f_path.dentry, file->f_path.mnt) &&
3556 + !IS_OPAQUE(inode)) {
3557 + res = union_copyup_dir(&file->f_path);
3558 + if (res)
3559 + goto out_unlock;
3560 + }
3561 +
3562 res = file->f_op->readdir(file, buf, filler);
3563 file_accessed(file);
3564 }
3565 +out_unlock:
3566 mutex_unlock(&inode->i_mutex);
3567 out:
3568 return res;
3569 @@ -77,6 +93,9 @@ static int fillonedir(void * __buf, cons
3570 struct old_linux_dirent __user * dirent;
3571 unsigned long d_ino;
3572
3573 + if (d_type == DT_WHT)
3574 + return 0;
3575 +
3576 if (buf->result)
3577 return -EINVAL;
3578 d_ino = ino;
3579 @@ -154,6 +173,9 @@ static int filldir(void * __buf, const c
3580 unsigned long d_ino;
3581 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(long));
3582
3583 + if (d_type == DT_WHT)
3584 + return 0;
3585 +
3586 buf->error = -EINVAL; /* only used if we fail.. */
3587 if (reclen > buf->count)
3588 return -EINVAL;
3589 @@ -239,6 +261,9 @@ static int filldir64(void * __buf, const
3590 struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf;
3591 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 1, sizeof(u64));
3592
3593 + if (d_type == DT_WHT)
3594 + return 0;
3595 +
3596 buf->error = -EINVAL; /* only used if we fail.. */
3597 if (reclen > buf->count)
3598 return -EINVAL;
3599 --- a/fs/super.c
3600 +++ b/fs/super.c
3601 @@ -596,6 +596,15 @@ int do_remount_sb(struct super_block *sb
3602 }
3603 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
3604
3605 + /* If we are remounting read/write, make sure that none of the
3606 + users require read-only for correct operation (such as
3607 + union mounts). */
3608 + if (remount_rw && sb->s_readonly_users) {
3609 + printk(KERN_INFO "%s: In use by %d read-only user(s)\n",
3610 + sb->s_id, sb->s_readonly_users);
3611 + return -EROFS;
3612 + }
3613 +
3614 if (sb->s_op->remount_fs) {
3615 retval = sb->s_op->remount_fs(sb, &flags, data);
3616 if (retval)
3617 @@ -953,6 +962,11 @@ vfs_kern_mount(struct file_system_type *
3618 WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
3619 "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);
3620
3621 + error = -EROFS;
3622 + if (!(flags & MS_RDONLY) &&
3623 + (mnt->mnt_sb->s_readonly_users))
3624 + goto out_sb;
3625 +
3626 mnt->mnt_mountpoint = mnt->mnt_root;
3627 mnt->mnt_parent = mnt;
3628 up_write(&mnt->mnt_sb->s_umount);
3629 --- /dev/null
3630 +++ b/fs/union.c
3631 @@ -0,0 +1,981 @@
3632 +/*
3633 + * VFS based union mount for Linux
3634 + *
3635 + * Copyright (C) 2004-2007 IBM Corporation, IBM Deutschland Entwicklung GmbH.
3636 + * Copyright (C) 2007-2009 Novell Inc.
3637 + *
3638 + * Author(s): Jan Blunck (j.blunck@tu-harburg.de)
3639 + * Valerie Aurora <vaurora@redhat.com>
3640 + *
3641 + * This program is free software; you can redistribute it and/or modify it
3642 + * under the terms of the GNU General Public License as published by the Free
3643 + * Software Foundation; either version 2 of the License, or (at your option)
3644 + * any later version.
3645 + */
3646 +
3647 +#include <linux/bootmem.h>
3648 +#include <linux/init.h>
3649 +#include <linux/module.h>
3650 +#include <linux/types.h>
3651 +#include <linux/hash.h>
3652 +#include <linux/fs.h>
3653 +#include <linux/mount.h>
3654 +#include <linux/fs_struct.h>
3655 +#include <linux/union.h>
3656 +#include <linux/namei.h>
3657 +#include <linux/file.h>
3658 +#include <linux/mm.h>
3659 +#include <linux/quotaops.h>
3660 +#include <linux/dnotify.h>
3661 +#include <linux/security.h>
3662 +#include <linux/pipe_fs_i.h>
3663 +#include <linux/splice.h>
3664 +
3665 +/*
3666 + * This is borrowed from fs/inode.c. The hashtable for lookups. Somebody
3667 + * should try to make this good - I've just made it work.
3668 + */
3669 +static unsigned int union_hash_mask __read_mostly;
3670 +static unsigned int union_hash_shift __read_mostly;
3671 +static struct hlist_head *union_hashtable __read_mostly;
3672 +static unsigned int union_rhash_mask __read_mostly;
3673 +static unsigned int union_rhash_shift __read_mostly;
3674 +static struct hlist_head *union_rhashtable __read_mostly;
3675 +
3676 +/*
3677 + * Locking Rules:
3678 + * - dcache_lock (for union_rlookup() only)
3679 + * - union_lock
3680 + */
3681 +DEFINE_SPINLOCK(union_lock);
3682 +
3683 +static struct kmem_cache *union_cache __read_mostly;
3684 +
3685 +static unsigned long hash(struct dentry *dentry, struct vfsmount *mnt)
3686 +{
3687 + unsigned long tmp;
3688 +
3689 + tmp = ((unsigned long)mnt * (unsigned long)dentry) ^
3690 + (GOLDEN_RATIO_PRIME + (unsigned long)mnt) / L1_CACHE_BYTES;
3691 + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> union_hash_shift);
3692 + return tmp & union_hash_mask;
3693 +}
3694 +
3695 +static __initdata unsigned long union_hash_entries;
3696 +
3697 +static int __init set_union_hash_entries(char *str)
3698 +{
3699 + if (!str)
3700 + return 0;
3701 + union_hash_entries = simple_strtoul(str, &str, 0);
3702 + return 1;
3703 +}
3704 +
3705 +__setup("union_hash_entries=", set_union_hash_entries);
3706 +
3707 +static int __init init_union(void)
3708 +{
3709 + int loop;
3710 +
3711 + union_cache = KMEM_CACHE(union_mount, SLAB_PANIC | SLAB_MEM_SPREAD);
3712 + union_hashtable = alloc_large_system_hash("Union-cache",
3713 + sizeof(struct hlist_head),
3714 + union_hash_entries,
3715 + 14,
3716 + 0,
3717 + &union_hash_shift,
3718 + &union_hash_mask,
3719 + 0);
3720 +
3721 + for (loop = 0; loop < (1 << union_hash_shift); loop++)
3722 + INIT_HLIST_HEAD(&union_hashtable[loop]);
3723 +
3724 +
3725 + union_rhashtable = alloc_large_system_hash("rUnion-cache",
3726 + sizeof(struct hlist_head),
3727 + union_hash_entries,
3728 + 14,
3729 + 0,
3730 + &union_rhash_shift,
3731 + &union_rhash_mask,
3732 + 0);
3733 +
3734 + for (loop = 0; loop < (1 << union_rhash_shift); loop++)
3735 + INIT_HLIST_HEAD(&union_rhashtable[loop]);
3736 +
3737 + return 0;
3738 +}
3739 +
3740 +fs_initcall(init_union);
3741 +
3742 +struct union_mount *union_alloc(struct dentry *this, struct vfsmount *this_mnt,
3743 + struct dentry *next, struct vfsmount *next_mnt)
3744 +{
3745 + struct union_mount *um;
3746 +
3747 + BUG_ON(!S_ISDIR(this->d_inode->i_mode));
3748 + BUG_ON(!S_ISDIR(next->d_inode->i_mode));
3749 +
3750 + um = kmem_cache_alloc(union_cache, GFP_ATOMIC);
3751 + if (!um)
3752 + return NULL;
3753 +
3754 + atomic_set(&um->u_count, 1);
3755 + INIT_LIST_HEAD(&um->u_unions);
3756 + INIT_LIST_HEAD(&um->u_list);
3757 + INIT_HLIST_NODE(&um->u_hash);
3758 + INIT_HLIST_NODE(&um->u_rhash);
3759 +
3760 + um->u_this.mnt = this_mnt;
3761 + um->u_this.dentry = this;
3762 + um->u_next.mnt = mntget(next_mnt);
3763 + um->u_next.dentry = dget(next);
3764 +
3765 + return um;
3766 +}
3767 +
3768 +struct union_mount *union_get(struct union_mount *um)
3769 +{
3770 + BUG_ON(!atomic_read(&um->u_count));
3771 + atomic_inc(&um->u_count);
3772 + return um;
3773 +}
3774 +
3775 +static int __union_put(struct union_mount *um)
3776 +{
3777 + if (!atomic_dec_and_test(&um->u_count))
3778 + return 0;
3779 +
3780 + BUG_ON(!hlist_unhashed(&um->u_hash));
3781 + BUG_ON(!hlist_unhashed(&um->u_rhash));
3782 +
3783 + kmem_cache_free(union_cache, um);
3784 + return 1;
3785 +}
3786 +
3787 +void union_put(struct union_mount *um)
3788 +{
3789 + struct path tmp = um->u_next;
3790 +
3791 + if (__union_put(um))
3792 + path_put(&tmp);
3793 +}
3794 +
3795 +static void __union_hash(struct union_mount *um)
3796 +{
3797 + hlist_add_head(&um->u_hash, union_hashtable +
3798 + hash(um->u_this.dentry, um->u_this.mnt));
3799 + hlist_add_head(&um->u_rhash, union_rhashtable +
3800 + hash(um->u_next.dentry, um->u_next.mnt));
3801 +}
3802 +
3803 +static void __union_unhash(struct union_mount *um)
3804 +{
3805 + hlist_del_init(&um->u_hash);
3806 + hlist_del_init(&um->u_rhash);
3807 +}
3808 +
3809 +struct union_mount *union_lookup(struct dentry *dentry, struct vfsmount *mnt)
3810 +{
3811 + struct hlist_head *head = union_hashtable + hash(dentry, mnt);
3812 + struct hlist_node *node;
3813 + struct union_mount *um;
3814 +
3815 + hlist_for_each_entry(um, node, head, u_hash) {
3816 + if ((um->u_this.dentry == dentry) &&
3817 + (um->u_this.mnt == mnt))
3818 + return um;
3819 + }
3820 +
3821 + return NULL;
3822 +}
3823 +
3824 +struct union_mount *union_rlookup(struct dentry *dentry, struct vfsmount *mnt)
3825 +{
3826 + struct hlist_head *head = union_rhashtable + hash(dentry, mnt);
3827 + struct hlist_node *node;
3828 + struct union_mount *um;
3829 +
3830 + hlist_for_each_entry(um, node, head, u_rhash) {
3831 + if ((um->u_next.dentry == dentry) &&
3832 + (um->u_next.mnt == mnt))
3833 + return um;
3834 + }
3835 +
3836 + return NULL;
3837 +}
3838 +
3839 +/*
3840 + * is_unionized - check if a dentry lives on a union mounted file system
3841 + *
3842 + * Walks up the mount hierarchy from @mnt; returns 1 if some child mount with
3843 + * MNT_UNION has a mountpoint that is_subdir() accepts for the dentry, else 0.
3844 + */
3845 +int is_unionized(struct dentry *dentry, struct vfsmount *mnt)
3846 +{
3847 + struct path this = { .mnt = mntget(mnt),
3848 + .dentry = dget(dentry) };
3849 + struct vfsmount *tmp;
3850 +
3851 + do {
3852 + /* check if there is a union mounted on top of us */
3853 + spin_lock(&vfsmount_lock);
3854 + list_for_each_entry(tmp, &this.mnt->mnt_mounts, mnt_child) {
3855 + if (!(tmp->mnt_flags & MNT_UNION))
3856 + continue;
3857 + /* Isn't this a bug? */
3858 + if (this.dentry->d_sb != tmp->mnt_mountpoint->d_sb)
3859 + continue;
3860 + if (is_subdir(this.dentry, tmp->mnt_mountpoint)) {
3861 + spin_unlock(&vfsmount_lock);
3862 + path_put(&this);
3863 + return 1;
3864 + }
3865 + }
3866 + spin_unlock(&vfsmount_lock);
3867 +
3868 + /* check our mountpoint next */
3869 + tmp = mntget(this.mnt->mnt_parent);
3870 + dput(this.dentry);
3871 + this.dentry = dget(this.mnt->mnt_mountpoint);
3872 + mntput(this.mnt);
3873 + this.mnt = tmp;
3874 + } while (this.mnt != this.mnt->mnt_parent);
3875 +
3876 + path_put(&this);
3877 + return 0;
3878 +}
3879 +
3880 +int append_to_union(struct vfsmount *mnt, struct dentry *dentry,
3881 + struct vfsmount *dest_mnt, struct dentry *dest_dentry)
3882 +{
3883 + struct union_mount *this, *um;
3884 +
3885 + BUG_ON(!IS_MNT_UNION(mnt));
3886 +
3887 + this = union_alloc(dentry, mnt, dest_dentry, dest_mnt);
3888 + if (!this)
3889 + return -ENOMEM;
3890 +
3891 + spin_lock(&union_lock);
3892 + um = union_lookup(dentry, mnt);
3893 + if (um) {
3894 + BUG_ON((um->u_next.dentry != dest_dentry) ||
3895 + (um->u_next.mnt != dest_mnt));
3896 + spin_unlock(&union_lock);
3897 + union_put(this);
3898 + return 0;
3899 + }
3900 + list_add(&this->u_list, &mnt->mnt_unions);
3901 + list_add(&this->u_unions, &dentry->d_unions);
3902 + dest_dentry->d_unionized++;
3903 + __union_hash(this);
3904 + spin_unlock(&union_lock);
3905 + return 0;
3906 +}
3907 +
3908 +/*
3909 + * follow_union_down - follow the union stack one layer down
3910 + *
3911 + * Called by union-aware lookup functions to step from one layer to the next
3912 + * overlaid one: @path's references are dropped and replaced by the lower path.
3913 + * NOTE(review): u_next is pinned after union_lock is dropped - confirm safe.
3914 + *
3915 + * Returns non-zero if followed to the next layer, zero otherwise.
3916 + */
3917 +int follow_union_down(struct path *path)
3918 +{
3919 + struct union_mount *um;
3920 +
3921 + if (!IS_MNT_UNION(path->mnt))
3922 + return 0;
3923 +
3924 + spin_lock(&union_lock);
3925 + um = union_lookup(path->dentry, path->mnt);
3926 + spin_unlock(&union_lock);
3927 + if (um) {
3928 + path_get(&um->u_next);
3929 + dput(path->dentry);
3930 + path->dentry = um->u_next.dentry;
3931 + mntput(path->mnt);
3932 + path->mnt = um->u_next.mnt;
3933 + return 1;
3934 + }
3935 + return 0;
3936 +}
3937 +
3938 +/*
3939 + * follow_union_mount - follow the union stack to the topmost layer
3940 + *
3941 + * This is called to traverse the union stack to the topmost layer. This is
3942 + * necessary for following parent pointers in a union mount.
3943 + *
3944 + * Returns non-zero if followed to the topmost layer, zero otherwise.
3945 + */
3946 +int follow_union_mount(struct path *path)
3947 +{
3948 + struct union_mount *um;
3949 + int res = 0;
3950 +
3951 + while (IS_UNION(path->dentry)) {
3952 + spin_lock(&dcache_lock);
3953 + spin_lock(&union_lock);
3954 + um = union_rlookup(path->dentry, path->mnt);
3955 + if (um)
3956 + path_get(&um->u_this);
3957 + spin_unlock(&union_lock);
3958 + spin_unlock(&dcache_lock);
3959 +
3960 + /*
3961 + * Q: Aaargh, how do I validate the topmost dentry pointer?
3962 + * A: Eeeeasy! We took the dcache_lock and union_lock. Since
3963 + * this protects from any dput'ng going on, we know that the
3964 + * dentry is valid since the union is unhashed under
3965 + * dcache_lock too.
3966 + */
3967 + if (!um)
3968 + break;
3969 + dput(path->dentry);
3970 + path->dentry = um->u_this.dentry;
3971 + mntput(path->mnt);
3972 + path->mnt = um->u_this.mnt;
3973 + res = 1;
3974 + }
3975 +
3976 + return res;
3977 +}
3978 +
3979 +/*
3980 + * Union mount copyup support
3981 + */
3982 +
3983 +extern int hash_lookup_union(struct nameidata *, struct qstr *, struct path *);
3984 +extern void follow_mount(struct path *);
3985 +
3986 +/*
3987 + * union_relookup_topmost - lookup and create the topmost path to dentry
3988 + * @nd: pointer to nameidata
3989 + * @flags: lookup flags
3990 + */
3991 +static int union_relookup_topmost(struct nameidata *nd, int flags)
3992 +{
3993 + int err;
3994 + char *kbuf, *name;
3995 + struct nameidata this;
3996 +
3997 + kbuf = (char *)__get_free_page(GFP_KERNEL);
3998 + if (!kbuf)
3999 + return -ENOMEM;
4000 +
4001 + name = d_path(&nd->path, kbuf, PAGE_SIZE);
4002 + err = PTR_ERR(name);
4003 + if (IS_ERR(name))
4004 + goto free_page;
4005 +
4006 + err = path_lookup(name, flags|LOOKUP_CREATE|LOOKUP_TOPMOST, &this);
4007 + if (err)
4008 + goto free_page;
4009 +
4010 + path_put(&nd->path);
4011 + nd->path.dentry = this.path.dentry;
4012 + nd->path.mnt = this.path.mnt;
4013 +
4014 + /*
4015 + * the nd->flags should be unchanged
4016 + */
4017 + BUG_ON(this.um_flags & LAST_LOWLEVEL);
4018 + nd->um_flags &= ~LAST_LOWLEVEL;
4019 + free_page:
4020 + free_page((unsigned long)kbuf);
4021 + return err;
4022 +}
4023 +
4024 +static void __update_fs_pwd(struct path *path, struct dentry *dentry,
4025 + struct vfsmount *mnt) /* NOTE(review): dentry and mnt are never used in the body */
4026 +{
4027 + struct path old = { NULL, NULL }; /* stays NULL unless the task's cwd dentry equals path->dentry */
4028 +
4029 + write_lock(&current->fs->lock);
4030 + if (current->fs->pwd.dentry == path->dentry) {
4031 + old = current->fs->pwd; /* snapshot only: fs->pwd itself is never reassigned here */
4032 + path_get(&current->fs->pwd); /* reference handed back by path_put(&old) below */
4033 + }
4034 + write_unlock(&current->fs->lock);
4035 +
4036 + if (old.dentry)
4037 + path_put(&old); /* NOTE(review): net effect is get+put, pwd is not updated - confirm intent */
4038 +
4039 + return;
4040 +}
4041 +
4042 +/**
4043 + * union_permission - check access rights for the inode behind a path
4044 + * @path: path (dentry and vfsmount) whose inode is checked
4045 + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
4046 + *
4047 + * In a union mount, the top layer is always read-write and the bottom
4048 + * is always read-only. Ignore the read-only flag on the lower fs.
4049 + *
4050 + * Only needed for certain activities, like checking to see if write
4051 + * access is ok. Returns what the underlying permission helper returns.
4052 + */
4053 +
4054 +int union_permission(struct path *path, int mask)
4055 +{
4056 + struct dentry *dentry = path->dentry;
4057 +
4058 + if (is_unionized(dentry, path->mnt))
4059 + /* Tell __inode_permission to ignore MS_RDONLY */
4060 + return __inode_permission(dentry->d_inode, mask, 0);
4061 +
4062 + return inode_permission(dentry->d_inode, mask); /* not unionized: ordinary check */
4063 +}
4064 +
4065 +/*
4066 + * union_create_topmost - create the topmost path component
4067 + * @nd: pointer to nameidata of the base directory
4068 + * @name: pointer to file name
4069 + * @path: pointer to path of the overlaid file (its i_mode picks file vs. directory)
4070 + * Returns the looked-up/created dentry or an ERR_PTR(); -EEXIST if @path shares the parent's superblock.
4071 + * This is called by __link_path_walk() to create the directories on a path
4072 + * when it is called with LOOKUP_TOPMOST.
4073 + */
4074 +struct dentry *union_create_topmost(struct nameidata *nd, struct qstr *name,
4075 + struct path *path)
4076 +{
4077 + struct dentry *dentry, *parent = nd->path.dentry;
4078 + int res, mode = path->dentry->d_inode->i_mode; /* the new object copies the overlaid inode's mode */
4079 +
4080 + if (parent->d_sb == path->dentry->d_sb) /* overlaid object already lives on the parent's superblock */
4081 + return ERR_PTR(-EEXIST);
4082 +
4083 + mutex_lock(&parent->d_inode->i_mutex); /* held across lookup and create; dropped at out_unlock */
4084 + dentry = lookup_one_len(name->name, nd->path.dentry, name->len);
4085 + if (IS_ERR(dentry)) /* the ERR_PTR is handed back to the caller unchanged */
4086 + goto out_unlock;
4087 +
4088 + switch (mode & S_IFMT) {
4089 + case S_IFREG:
4090 + /*
4091 + * FIXME: Does this make any sense in this case?
4092 + * Special case - lookup gave negative, but... we had foo/bar/
4093 + * From the vfs_mknod() POV we just have a negative dentry -
4094 + * all is fine. Let's be bastards - you had / on the end,you've
4095 + * been asking for (non-existent) directory. -ENOENT for you.
4096 + */
4097 + if (name->name[name->len] && !dentry->d_inode) {
4098 + dput(dentry);
4099 + dentry = ERR_PTR(-ENOENT);
4100 + goto out_unlock;
4101 + }
4102 +
4103 + res = vfs_create(parent->d_inode, dentry, mode, nd);
4104 + if (res) {
4105 + dput(dentry);
4106 + dentry = ERR_PTR(res);
4107 + goto out_unlock;
4108 + }
4109 + break;
4110 + case S_IFDIR:
4111 + res = vfs_mkdir(parent->d_inode, dentry, mode);
4112 + if (res) {
4113 + dput(dentry);
4114 + dentry = ERR_PTR(res);
4115 + goto out_unlock;
4116 + }
4117 +
4118 + res = append_to_union(nd->path.mnt, dentry, path->mnt,
4119 + path->dentry); /* presumably stacks the new directory on top of @path - confirm */
4120 + if (res) { /* NOTE(review): the directory made by vfs_mkdir() above is not removed on this path */
4121 + dput(dentry);
4122 + dentry = ERR_PTR(res);
4123 + goto out_unlock;
4124 + }
4125 + break;
4126 + default: /* only regular files and directories are handled */
4127 + dput(dentry);
4128 + dentry = ERR_PTR(-EINVAL);
4129 + goto out_unlock;
4130 + }
4131 +
4132 + /* FIXME: Really necessary ??? */
4133 +/* __update_fs_pwd(path, dentry, nd->path.mnt); */
4134 +
4135 + out_unlock:
4136 + mutex_unlock(&parent->d_inode->i_mutex);
4137 + return dentry;
4138 +}
4139 +
4140 +static int union_copy_file(struct dentry *old_dentry, struct vfsmount *old_mnt,
4141 + struct dentry *new_dentry, struct vfsmount *new_mnt) /* copies old's data into new; returns 0 or -errno */
4142 +{
4143 + int ret;
4144 + loff_t size; /* full-width i_size: a size_t here truncated it before the -EFBIG test */
4145 + loff_t offset;
4146 + struct file *old_file, *new_file;
4147 + const struct cred *cred = current_cred();
4148 +
4149 + dget(old_dentry); /* references for the file opened below */
4150 + mntget(old_mnt);
4151 + old_file = dentry_open(old_dentry, old_mnt, O_RDONLY, cred);
4152 + if (IS_ERR(old_file))
4153 + return PTR_ERR(old_file);
4154 +
4155 + dget(new_dentry); /* likewise for the destination */
4156 + mntget(new_mnt);
4157 + new_file = dentry_open(new_dentry, new_mnt, O_WRONLY, cred);
4158 + ret = PTR_ERR(new_file); /* only used if IS_ERR(new_file) below */
4159 + if (IS_ERR(new_file))
4160 + goto fput_old;
4161 +
4162 + /* XXX be smart by using a length param, which indicates max
4163 + * data we'll want (e.g., we are about to truncate to 0 or 10
4164 + * bytes or something */
4165 + size = i_size_read(old_file->f_path.dentry->d_inode);
4166 + if (((size_t)size != size) || ((ssize_t)size != size)) { /* does not fit the size_t length passed to splice */
4167 + ret = -EFBIG;
4168 + goto fput_new;
4169 + }
4170 +
4171 + offset = 0;
4172 + ret = do_splice_direct(old_file, &offset, new_file, size,
4173 + SPLICE_F_MOVE);
4174 + if (ret >= 0) /* NOTE(review): a short splice (ret < size) is reported as success too */
4175 + ret = 0;
4176 + fput_new:
4177 + fput(new_file);
4178 + fput_old:
4179 + fput(old_file);
4180 + return ret;
4181 +}
4182 +
4183 +/**
4184 + * __union_copyup - copy a file to the topmost directory
4185 + * @old: pointer to path of the old file name (directories are refused with -EISDIR)
4186 + * @new_nd: pointer to nameidata of the topmost directory
4187 + * @new: pointer to path of the new file name; on success repointed at the topmost copy
4188 + *
4189 + * The topmost directory @new_nd must already be locked. Creates the topmost
4190 + * file if it doesn't exist yet. Returns 0 or a negative errno.
4191 + */
4192 +int __union_copyup(struct path *old, struct nameidata *new_nd,
4193 + struct path *new)
4194 +{
4195 + struct dentry *dentry;
4196 + int error;
4197 +
4198 + /* Maybe this should be -EINVAL */
4199 + if (S_ISDIR(old->dentry->d_inode->i_mode))
4200 + return -EISDIR;
4201 +
4202 + if (new_nd->path.dentry != new->dentry->d_parent) { /* @new is not a child of the topmost dir: look the name up there */
4203 + mutex_lock(&new_nd->path.dentry->d_inode->i_mutex); /* NOTE(review): header says the dir is already locked - confirm the locking rule */
4204 + dentry = lookup_one_len(new->dentry->d_name.name,
4205 + new_nd->path.dentry,
4206 + new->dentry->d_name.len);
4207 + mutex_unlock(&new_nd->path.dentry->d_inode->i_mutex);
4208 + if (IS_ERR(dentry))
4209 + return PTR_ERR(dentry);
4210 + error = -EEXIST;
4211 + if (dentry->d_inode) /* a topmost entry of that name already exists */
4212 + goto out_dput;
4213 + } else
4214 + dentry = dget(new->dentry); /* extra reference, so both branches own one on dentry */
4215 +
4216 + if (!dentry->d_inode) {
4217 + error = vfs_create(new_nd->path.dentry->d_inode, dentry,
4218 + old->dentry->d_inode->i_mode, new_nd);
4219 + if (error)
4220 + goto out_dput;
4221 + }
4222 +
4223 + BUG_ON(!S_ISREG(old->dentry->d_inode->i_mode)); /* directories were rejected above; other non-regular types trip this */
4224 + error = union_copy_file(old->dentry, old->mnt, dentry,
4225 + new_nd->path.mnt);
4226 + if (error) {
4227 + /* FIXME: are there return value we should not
4228 + * BUG() on ? */
4229 + BUG_ON(vfs_unlink(new_nd->path.dentry->d_inode,
4230 + dentry));
4231 + goto out_dput;
4232 + }
4233 +
4234 + dput(new->dentry); /* success: @new gives up its old dentry for the topmost one */
4235 + new->dentry = dentry;
4236 + if (new->mnt != new_nd->path.mnt)
4237 + mntput(new->mnt);
4238 + new->mnt = new_nd->path.mnt; /* no mntget() here: the mnt pointer is shared with @new_nd */
4239 + return error; /* 0 at this point */
4240 +
4241 +out_dput:
4242 + dput(dentry);
4243 + return error;
4244 +}
4245 +
4246 +/*
4247 + * union_copyup - copy a file to the topmost layer of the union stack
4248 + * @nd: nameidata pointer to the file; on success nd->path refers to the topmost file
4249 + * @flags: flags given to open_namei (not used in the body)
4250 + */
4251 +int union_copyup(struct nameidata *nd, int flags /* XXX not used */)
4252 +{
4253 + struct qstr this;
4254 + char *name;
4255 + struct dentry *dir;
4256 + struct path path;
4257 + int err;
4258 +
4259 + if (!is_unionized(nd->path.dentry, nd->path.mnt))
4260 + return 0; /* not part of a union: nothing to do */
4261 + if (!S_ISREG(nd->path.dentry->d_inode->i_mode))
4262 + return 0; /* only regular files are copied up */
4263 +
4264 + /* save the name for hash_lookup_union(); nd->path is replaced by the relookup below */
4265 + this.len = nd->path.dentry->d_name.len;
4266 + this.hash = nd->path.dentry->d_name.hash;
4267 + name = kmalloc(this.len + 1, GFP_KERNEL);
4268 + if (!name)
4269 + return -ENOMEM;
4270 + this.name = name;
4271 + memcpy(name, nd->path.dentry->d_name.name, nd->path.dentry->d_name.len);
4272 + name[this.len] = 0;
4273 +
4274 + err = union_relookup_topmost(nd, nd->flags|LOOKUP_PARENT); /* afterwards nd->path is used as the parent directory */
4275 + if (err) {
4276 + kfree(name);
4277 + return err;
4278 + }
4279 + nd->flags &= ~LOOKUP_PARENT;
4280 +
4281 + dir = nd->path.dentry;
4282 + mutex_lock(&dir->d_inode->i_mutex); /* held only around the lookup */
4283 + err = hash_lookup_union(nd, &this, &path);
4284 + mutex_unlock(&dir->d_inode->i_mutex);
4285 + kfree(name);
4286 + if (err)
4287 + return err;
4288 +
4289 + err = -ENOENT;
4290 + if (!path.dentry->d_inode)
4291 + goto exit_dput;
4292 +
4293 + /* Necessary?! I guess not ... */
4294 + follow_mount(&path);
4295 +
4296 + err = -ENOENT; /* re-checked because follow_mount() may have changed path */
4297 + if (!path.dentry->d_inode)
4298 + goto exit_dput;
4299 +
4300 + err = -EISDIR; /* NOTE(review): returned for every non-regular type, not only directories */
4301 + if (!S_ISREG(path.dentry->d_inode->i_mode))
4302 + goto exit_dput;
4303 +
4304 + if (path.dentry->d_parent != nd->path.dentry) { /* found file is not yet a child of the topmost dir */
4305 + err = __union_copyup(&path, nd, &path); /* same path as source and destination; updated in place on success */
4306 + if (err)
4307 + goto exit_dput;
4308 + }
4309 +
4310 + dput(nd->path.dentry); /* swap the parent directory in nd for the file itself */
4311 + if (nd->path.mnt != path.mnt)
4312 + mntput(nd->path.mnt);
4313 + nd->path = path;
4314 + return 0;
4315 +
4316 +exit_dput:
4317 + dput(path.dentry);
4318 + if (path.mnt != nd->path.mnt)
4319 + mntput(path.mnt);
4320 + return err;
4321 +}
4322 +
4323 +/*
4324 + * This must be called when unhashing a dentry. It is called with dcache_lock
4325 + * held and unhashes every union_mount on dentry->d_unions (entries stay on the list).
4326 + */
4327 +void __d_drop_unions(struct dentry *dentry)
4328 +{
4329 + struct union_mount *this, *next;
4330 +
4331 + spin_lock(&union_lock); /* taken here, inside the caller's dcache_lock */
4332 + list_for_each_entry_safe(this, next, &dentry->d_unions, u_unions)
4333 + __union_unhash(this); /* list removal is left to shrink_d_unions()/__shrink_d_unions() */
4334 + spin_unlock(&union_lock);
4335 +}
4336 +EXPORT_SYMBOL_GPL(__d_drop_unions);
4337 +
4338 +/*
4339 + * This must be called after __d_drop_unions() without holding any locks.
4340 + * Note: The dentry might still be reachable via a lookup but at that time it
4341 + * is already a negative dentry. Otherwise it would be unhashed. The union_mount
4342 + * structure itself is still reachable through mnt->mnt_unions (which we
4343 + * protect against with union_lock).
4344 + */
4345 +void shrink_d_unions(struct dentry *dentry)
4346 +{
4347 + struct union_mount *this, *next;
4348 +
4349 +repeat:
4350 + spin_lock(&union_lock);
4351 + list_for_each_entry_safe(this, next, &dentry->d_unions, u_unions) { /* body always leaves via goto: one entry per pass */
4352 + BUG_ON(!hlist_unhashed(&this->u_hash)); /* must already be unhashed, see the header comment */
4353 + BUG_ON(!hlist_unhashed(&this->u_rhash));
4354 + list_del(&this->u_list);
4355 + list_del(&this->u_unions);
4356 + this->u_next.dentry->d_unionized--;
4357 + spin_unlock(&union_lock); /* union_put() is called without union_lock */
4358 + union_put(this);
4359 + goto repeat; /* restart from the list head with the lock re-taken */
4360 + }
4361 + spin_unlock(&union_lock);
4362 +}
4363 +
4364 +extern void __dput(struct dentry *, struct list_head *, int);
4365 +
4366 +/*
4367 + * This is the special variant for use in dput() only: same teardown as shrink_d_unions(), but the overlaid dentry is released via __dput() with @list.
4368 + */
4369 +void __shrink_d_unions(struct dentry *dentry, struct list_head *list)
4370 +{
4371 + struct union_mount *this, *next;
4372 +
4373 + BUG_ON(!d_unhashed(dentry));
4374 +
4375 +repeat:
4376 + spin_lock(&union_lock);
4377 + list_for_each_entry_safe(this, next, &dentry->d_unions, u_unions) { /* body always leaves via goto: one entry per pass */
4378 + struct dentry *n_dentry = this->u_next.dentry; /* saved up front; presumably 'this' may be gone after __union_put() */
4379 + struct vfsmount *n_mnt = this->u_next.mnt;
4380 +
4381 + BUG_ON(!hlist_unhashed(&this->u_hash));
4382 + BUG_ON(!hlist_unhashed(&this->u_rhash));
4383 + list_del(&this->u_list);
4384 + list_del(&this->u_unions);
4385 + this->u_next.dentry->d_unionized--;
4386 + spin_unlock(&union_lock);
4387 + if (__union_put(this)) { /* nonzero: this function releases the overlaid path itself */
4388 + __dput(n_dentry, list, 0);
4389 + mntput(n_mnt);
4390 + }
4391 + goto repeat;
4392 + }
4393 + spin_unlock(&union_lock);
4394 +}
4395 +
4396 +/*
4397 + * Unhash, unlink and put every union_mount on mnt->mnt_unions, except the
4398 + * one whose u_this dentry is mnt->mnt_root (detach_mnt_union() removes that one).
4399 + */
4400 +void shrink_mnt_unions(struct vfsmount *mnt)
4401 +{
4402 + struct union_mount *this, *next;
4403 +
4404 +repeat:
4405 + spin_lock(&union_lock);
4406 + list_for_each_entry_safe(this, next, &mnt->mnt_unions, u_list) {
4407 + if (this->u_this.dentry == mnt->mnt_root) /* keep the union of the mount root itself */
4408 + continue;
4409 + __union_unhash(this);
4410 + list_del(&this->u_list);
4411 + list_del(&this->u_unions);
4412 + this->u_next.dentry->d_unionized--;
4413 + spin_unlock(&union_lock); /* union_put() is called without union_lock */
4414 + union_put(this);
4415 + goto repeat; /* restart the walk from the list head */
4416 + }
4417 + spin_unlock(&union_lock);
4418 +}
4419 +
4420 +int attach_mnt_union(struct vfsmount *mnt, struct vfsmount *dest_mnt,
4421 + struct dentry *dest_dentry)
4422 +{
4423 + /* Only mounts flagged MNT_UNION are stacked on the destination; others report success. */
4424 + if (IS_MNT_UNION(mnt))
4425 + return append_to_union(mnt, mnt->mnt_root, dest_mnt, dest_dentry);
4426 + return 0;
4427 +}
4428 +
4429 +void detach_mnt_union(struct vfsmount *mnt)
4430 +{
4431 + struct union_mount *um;
4432 +
4433 + if (!IS_MNT_UNION(mnt)) /* nothing to undo for mounts without MNT_UNION */
4434 + return;
4435 +
4436 + shrink_mnt_unions(mnt); /* removes every union on this mount except the one for mnt_root */
4437 +
4438 + spin_lock(&union_lock);
4439 + um = union_lookup(mnt->mnt_root, mnt); /* NOTE(review): result is used unchecked - confirm it cannot be NULL here */
4440 + __union_unhash(um);
4441 + list_del(&um->u_list);
4442 + list_del(&um->u_unions);
4443 + um->u_next.dentry->d_unionized--;
4444 + spin_unlock(&union_lock);
4445 + union_put(um); /* put without union_lock, as in shrink_mnt_unions() */
4446 + return;
4447 +}
4448 +
4449 +/**
4450 + * union_copyup_dir_one - readdir callback: create one fallthru entry in the topmost dir
4451 + * @buf: topmost directory dentry, handed to ->readdir() by union_copyup_dir()
4452 + * @name, @namlen: entry name and its length; "." and ".." are skipped
4453 + * We get the entries from higher level layers first. Always returns 0;
4454 + * offset, ino and d_type are unused. */
4455 +
4456 +static int union_copyup_dir_one(void *buf, const char *name, int namlen,
4457 + loff_t offset, u64 ino, unsigned int d_type)
4458 +{
4459 + struct dentry *topmost_dentry = (struct dentry *) buf;
4460 + struct dentry *dentry;
4461 + int err = 0;
4462 +
4463 + switch (namlen) {
4464 + case 2:
4465 + if (name[1] != '.')
4466 + break;
4467 + case 1: /* also reached by fall-through from case 2 when name[1] == '.' */
4468 + if (name[0] != '.')
4469 + break;
4470 + return 0;
4471 + }
4472 +
4473 + /* Lookup this entry in the topmost directory */
4474 + dentry = lookup_one_len(name, topmost_dentry, namlen);
4475 +
4476 + if (IS_ERR(dentry)) { /* dentry is an error pointer here: report the name we were given instead */
4477 + printk(KERN_INFO "error looking up %.*s\n", namlen, name);
4478 + goto out;
4479 + }
4480 +
4481 + /*
4482 + * If the entry already exists, one of the following is true:
4483 + * it was already copied up (due to an earlier lookup), an
4484 + * entry with the same name already exists on the topmost file
4485 + * system, it is a whiteout, or it is a fallthru. In each
4486 + * case, the top level entry masks any entries from lower file
4487 + * systems, so don't copy up this entry.
4488 + */
4489 + if (dentry->d_inode || d_is_whiteout(dentry) ||
4490 + d_is_fallthru(dentry)) {
4491 + printk(KERN_INFO "skipping copy of %s\n", dentry->d_name.name);
4492 + goto out_dput;
4493 + }
4494 +
4495 + /*
4496 + * If the entry doesn't exist, create a fallthru entry in the
4497 + * topmost file system. All possible directory types are
4498 + * used, so each file system must implement its own way of
4499 + * storing a fallthru entry.
4500 + */
4501 + printk(KERN_INFO "creating fallthru for %s\n", dentry->d_name.name);
4502 + err = topmost_dentry->d_inode->i_op->fallthru(topmost_dentry->d_inode,
4503 + dentry); /* NOTE(review): ->fallthru is called without a NULL check */
4504 + /* FIXME */
4505 + BUG_ON(err);
4506 + /*
4507 + * At this point, we have a negative dentry marked as fallthru
4508 + * in the cache. We could potentially lookup the entry lower
4509 + * level file system and turn this into a positive dentry
4510 + * right now, but it is not clear that would be a performance
4511 + * win and adds more opportunities to fail.
4512 + */
4513 +out_dput:
4514 + dput(dentry);
4515 +out:
4516 + return 0; /* a failed lookup is only logged, never reported to the caller */
4517 +}
4518 +
4519 +/**
4520 + * union_copyup_dir - copy up low-level directory entries to topmost dir
4521 + *
4522 + * readdir() is difficult to support on union file systems for two
4523 + * reasons: We must eliminate duplicates and apply whiteouts, and we
4524 + * must return something in f_pos that lets us restart in the same
4525 + * place when we return. Our solution is to, on first readdir() of
4526 + * the directory, copy up all visible entries from the low-level file
4527 + * systems and mark the entries that refer to low-level file system
4528 + * objects as "fallthru" entries.
4529 + */
4530 +
4531 +int union_copyup_dir(struct path *topmost_path)
4532 +{
4533 + struct dentry *topmost_dentry = topmost_path->dentry;
4534 + struct path path = *topmost_path; /* cursor that follow_union_down() moves through the stack */
4535 + int res = 0;
4536 +
4537 + /*
4538 + * Skip opaque dirs.
4539 + */
4540 + if (IS_OPAQUE(topmost_dentry->d_inode))
4541 + return 0;
4542 +
4543 + /*
4544 + * Mark this dir opaque to show that we have already copied up
4545 + * the lower entries. Only fallthru entries pass through to
4546 + * the underlying file system.
4547 + *
4548 + * XXX Deal with the lower file system changing. This could
4549 + * be through running a tool over the top level file system to
4550 + * make directories transparent again, or we could check the
4551 + * mtime of the underlying directory.
4552 + */
4553 +
4554 + topmost_dentry->d_inode->i_flags |= S_OPAQUE; /* NOTE(review): set before the copy-up runs, so it stays set even if the loop fails */
4555 + mark_inode_dirty(topmost_dentry->d_inode);
4556 +
4557 + /*
4558 + * Loop through each dir on each level copying up the entries
4559 + * to the topmost.
4560 + */
4561 +
4562 + /* Don't drop the caller's reference to the topmost path */
4563 + path_get(&path);
4564 + while (follow_union_down(&path)) {
4565 + struct file * ftmp;
4566 + struct inode * inode;
4567 +
4568 + /* XXX Permit fallthrus on lower-level? Would need to
4569 + * pass in opaque flag to union_copyup_dir_one() and
4570 + * only copy up fallthru entries there. We allow
4571 + * fallthrus in lower level opaque directories on
4572 + * lookup, so for consistency we should do one or the
4573 + * other in both places. */
4574 + if (IS_OPAQUE(path.dentry->d_inode))
4575 + break;
4576 +
4577 + /* dentry_open() doesn't get a path reference itself */
4578 + path_get(&path);
4579 + ftmp = dentry_open(path.dentry, path.mnt,
4580 + O_RDONLY | O_DIRECTORY | O_NOATIME,
4581 + current_cred());
4582 + if (IS_ERR(ftmp)) {
4583 + printk (KERN_ERR "unable to open dir %s for "
4584 + "directory copyup: %ld\n",
4585 + path.dentry->d_name.name, PTR_ERR(ftmp));
4586 + continue; /* skip this layer; res keeps its previous value */
4587 + }
4588 +
4589 + inode = path.dentry->d_inode;
4590 + mutex_lock(&inode->i_mutex); /* the lower directory's i_mutex is held across ->readdir() */
4591 +
4592 + res = -ENOENT;
4593 + if (IS_DEADDIR(inode))
4594 + goto out_fput;
4595 + /*
4596 + * Read the whole directory, calling our directory
4597 + * entry copyup function on each entry. Pass in the
4598 + * topmost dentry as our private data so we can create
4599 + * new entries in the topmost directory.
4600 + */
4601 + res = ftmp->f_op->readdir(ftmp, topmost_dentry,
4602 + union_copyup_dir_one); /* NOTE(review): f_op->readdir is called without a NULL check */
4603 +out_fput:
4604 + mutex_unlock(&inode->i_mutex);
4605 + fput(ftmp);
4606 +
4607 + if (res) /* stop at the first layer that is dead or whose readdir fails */
4608 + break;
4609 + }
4610 + path_put(&path);
4611 + return res; /* 0, -ENOENT for a dead directory, or the ->readdir() result */
4612 +}
4613 --- a/include/linux/dcache.h
4614 +++ b/include/linux/dcache.h
4615 @@ -101,6 +101,15 @@ struct dentry {
4616 struct dentry *d_parent; /* parent directory */
4617 struct qstr d_name;
4618
4619 +#ifdef CONFIG_UNION_MOUNT
4620 + /*
4621 + * The following fields are used by the VFS based union mount
4622 + * implementation. Both are protected by union_lock!
4623 + */
4624 + struct list_head d_unions; /* list of union_mount's */
4625 + unsigned int d_unionized; /* unions referencing this dentry */
4626 +#endif
4627 +
4628 struct list_head d_lru; /* LRU list */
4629 /*
4630 * d_child and d_rcu can share memory
4631 @@ -186,6 +195,9 @@ d_iput: no no no yes
4632
4633 #define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */
4634
4635 +#define DCACHE_WHITEOUT 0x0100 /* This negative dentry is a whiteout */
4636 +#define DCACHE_FALLTHRU 0x0200 /* Keep looking in the file system below */
4637 +
4638 extern spinlock_t dcache_lock;
4639 extern seqlock_t rename_lock;
4640
4641 @@ -205,12 +217,20 @@ extern seqlock_t rename_lock;
4642 * __d_drop requires dentry->d_lock.
4643 */
4644
4645 +#ifdef CONFIG_UNION_MOUNT
4646 +extern void __d_drop_unions(struct dentry *);
4647 +#endif
4648 +
4649 static inline void __d_drop(struct dentry *dentry)
4650 {
4651 if (!(dentry->d_flags & DCACHE_UNHASHED)) {
4652 dentry->d_flags |= DCACHE_UNHASHED;
4653 hlist_del_rcu(&dentry->d_hash);
4654 }
4655 +#ifdef CONFIG_UNION_MOUNT
4656 + /* remove dentry from the union hashtable; done even if the dentry was already unhashed */
4657 + __d_drop_unions(dentry);
4658 +#endif
4659 }
4660
4661 static inline void d_drop(struct dentry *dentry)
4662 @@ -358,6 +378,16 @@ static inline int d_unlinked(struct dent
4663 return d_unhashed(dentry) && !IS_ROOT(dentry);
4664 }
4665
4666 +static inline int d_is_whiteout(struct dentry *dentry)
4667 +{
4668 + return dentry->d_flags & DCACHE_WHITEOUT; /* nonzero (the raw flag bit) if marked as a whiteout */
4669 +}
4670 +
4671 +static inline int d_is_fallthru(struct dentry *dentry)
4672 +{
4673 + return dentry->d_flags & DCACHE_FALLTHRU; /* nonzero (the raw flag bit) if marked as a fallthru */
4674 +}
4675 +
4676 static inline struct dentry *dget_parent(struct dentry *dentry)
4677 {
4678 struct dentry *ret;
4679 --- a/include/linux/ext2_fs.h
4680 +++ b/include/linux/ext2_fs.h
4681 @@ -189,6 +189,7 @@ struct ext2_group_desc
4682 #define EXT2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */
4683 #define EXT2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */
4684 #define EXT2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/
4685 +#define EXT2_OPAQUE_FL 0x00040000
4686 #define EXT2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */
4687
4688 #define EXT2_FL_USER_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */
4689 @@ -503,10 +504,12 @@ struct ext2_super_block {
4690 #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004
4691 #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008
4692 #define EXT2_FEATURE_INCOMPAT_META_BG 0x0010
4693 +#define EXT2_FEATURE_INCOMPAT_WHITEOUT 0x0020
4694 #define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff
4695
4696 #define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
4697 #define EXT2_FEATURE_INCOMPAT_SUPP (EXT2_FEATURE_INCOMPAT_FILETYPE| \
4698 + EXT2_FEATURE_INCOMPAT_WHITEOUT| \
4699 EXT2_FEATURE_INCOMPAT_META_BG)
4700 #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
4701 EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
4702 @@ -573,6 +576,8 @@ enum {
4703 EXT2_FT_FIFO,
4704 EXT2_FT_SOCK,
4705 EXT2_FT_SYMLINK,
4706 + EXT2_FT_WHT,
4707 + EXT2_FT_FALLTHRU,
4708 EXT2_FT_MAX
4709 };
4710
4711 --- a/include/linux/fs.h
4712 +++ b/include/linux/fs.h
4713 @@ -188,6 +188,7 @@ struct inodes_stat_t {
4714 #define MS_REMOUNT 32 /* Alter flags of a mounted FS */
4715 #define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */
4716 #define MS_DIRSYNC 128 /* Directory modifications are synchronous */
4717 +#define MS_UNION 256
4718 #define MS_NOATIME 1024 /* Do not update access times. */
4719 #define MS_NODIRATIME 2048 /* Do not update directory access times */
4720 #define MS_BIND 4096
4721 @@ -205,6 +206,7 @@ struct inodes_stat_t {
4722 #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
4723 #define MS_I_VERSION (1<<23) /* Update inode I_version field */
4724 #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
4725 +#define MS_WHITEOUT (1<<26) /* fs does support white-out filetype */
4726 #define MS_ACTIVE (1<<30)
4727 #define MS_NOUSER (1<<31)
4728
4729 @@ -231,6 +233,7 @@ struct inodes_stat_t {
4730 #define S_NOCMTIME 128 /* Do not update file c/mtime */
4731 #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */
4732 #define S_PRIVATE 512 /* Inode is fs-internal */
4733 +#define S_OPAQUE 1024 /* Directory is opaque */
4734
4735 /*
4736 * Note that nosuid etc flags are inode-specific: setting some file-system
4737 @@ -266,6 +269,8 @@ struct inodes_stat_t {
4738 #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE)
4739 #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE)
4740
4741 +#define IS_OPAQUE(inode) ((inode)->i_flags & S_OPAQUE)
4742 +
4743 /* the read-only stuff doesn't really belong here, but any other place is
4744 probably as bad and I don't want to create yet another include file. */
4745
4746 @@ -1380,6 +1385,11 @@ struct super_block {
4747 * generic_show_options()
4748 */
4749 char *s_options;
4750 +
4751 + /*
4752 + * Users who require read-only access - e.g., union mounts
4753 + */
4754 + int s_readonly_users;
4755 };
4756
4757 extern struct timespec current_fs_time(struct super_block *sb);
4758 @@ -1517,6 +1527,8 @@ struct inode_operations {
4759 int (*mkdir) (struct inode *,struct dentry *,int);
4760 int (*rmdir) (struct inode *,struct dentry *);
4761 int (*mknod) (struct inode *,struct dentry *,int,dev_t);
4762 + int (*whiteout) (struct inode *, struct dentry *, struct dentry *);
4763 + int (*fallthru) (struct inode *, struct dentry *);
4764 int (*rename) (struct inode *, struct dentry *,
4765 struct inode *, struct dentry *);
4766 int (*readlink) (struct dentry *, char __user *,int);
4767 @@ -2108,6 +2120,7 @@ extern void emergency_remount(void);
4768 extern sector_t bmap(struct inode *, sector_t);
4769 #endif
4770 extern int notify_change(struct dentry *, struct iattr *);
4771 +extern int __inode_permission(struct inode *inode, int mask, int rofs);
4772 extern int inode_permission(struct inode *, int);
4773 extern int generic_permission(struct inode *, int,
4774 int (*check_acl)(struct inode *, int));
4775 @@ -2135,7 +2148,7 @@ extern void free_write_pipe(struct file
4776
4777 extern struct file *do_filp_open(int dfd, const char *pathname,
4778 int open_flag, int mode, int acc_mode);
4779 -extern int may_open(struct path *, int, int);
4780 +extern int may_open(struct nameidata *, int, int);
4781
4782 extern int kernel_read(struct file *, loff_t, char *, unsigned long);
4783 extern struct file * open_exec(const char *);
4784 --- a/include/linux/mount.h
4785 +++ b/include/linux/mount.h
4786 @@ -35,6 +35,7 @@ struct mnt_namespace;
4787 #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
4788 #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
4789 #define MNT_PNODE_MASK 0x3000 /* propagation flag mask */
4790 +#define MNT_UNION 0x4000 /* if the vfsmount is a union mount */
4791
4792 struct vfsmount {
4793 struct list_head mnt_hash;
4794 @@ -53,6 +54,9 @@ struct vfsmount {
4795 struct list_head mnt_slave_list;/* list of slave mounts */
4796 struct list_head mnt_slave; /* slave list entry */
4797 struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */
4798 +#ifdef CONFIG_UNION_MOUNT
4799 + struct list_head mnt_unions; /* list of union_mount structures */
4800 +#endif
4801 struct mnt_namespace *mnt_ns; /* containing namespace */
4802 int mnt_id; /* mount identifier */
4803 int mnt_group_id; /* peer group identifier */
4804 --- a/include/linux/namei.h
4805 +++ b/include/linux/namei.h
4806 @@ -20,6 +20,7 @@ struct nameidata {
4807 struct qstr last;
4808 struct path root;
4809 unsigned int flags;
4810 + unsigned int um_flags;
4811 int last_type;
4812 unsigned depth;
4813 char *saved_names[MAX_NESTED_LINKS + 1];
4814 @@ -35,6 +36,9 @@ struct nameidata {
4815 */
4816 enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
4817
4818 +#define LAST_UNION 0x01
4819 +#define LAST_LOWLEVEL 0x02
4820 +
4821 /*
4822 * The bitmask for a lookup event:
4823 * - follow links at the end
4824 @@ -49,6 +53,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA
4825 #define LOOKUP_CONTINUE 4
4826 #define LOOKUP_PARENT 16
4827 #define LOOKUP_REVAL 64
4828 +#define LOOKUP_TOPMOST 128
4829 +
4830 /*
4831 * Intent data
4832 */
4833 --- /dev/null
4834 +++ b/include/linux/union.h
4835 @@ -0,0 +1,84 @@
4836 +/*
4837 + * VFS based union mount for Linux
4838 + *
4839 + * Copyright (C) 2004-2007 IBM Corporation, IBM Deutschland Entwicklung GmbH.
4840 + * Copyright (C) 2007 Novell Inc.
4841 + * Author(s): Jan Blunck (j.blunck@tu-harburg.de)
4842 + *
4843 + * This program is free software; you can redistribute it and/or modify it
4844 + * under the terms of the GNU General Public License as published by the Free
4845 + * Software Foundation; either version 2 of the License, or (at your option)
4846 + * any later version.
4847 + *
4848 + */
4849 +#ifndef __LINUX_UNION_H
4850 +#define __LINUX_UNION_H
4851 +#ifdef __KERNEL__
4852 +
4853 +#include <linux/list.h>
4854 +#include <asm/atomic.h>
4855 +
4856 +struct dentry;
4857 +struct vfsmount;
4858 +
4859 +#ifdef CONFIG_UNION_MOUNT
4860 +
4861 +/*
4862 + * The new union mount structure: one link of a union stack, tying u_this to the u_next it overlays.
4863 + */
4864 +struct union_mount {
4865 + atomic_t u_count; /* reference count */
4866 + struct mutex u_mutex;
4867 + struct list_head u_unions; /* entry on dentry->d_unions */
4868 + struct list_head u_list; /* entry on vfsmount->mnt_unions */
4869 + struct hlist_node u_hash; /* hash node for searching */
4870 + struct hlist_node u_rhash; /* hash node for reverse searching */
4871 +
4872 + struct path u_this; /* this is me */
4873 + struct path u_next; /* this is what I overlay */
4874 +};
4875 +
4876 +#define IS_UNION(dentry) (!list_empty(&(dentry)->d_unions) || \
4877 + (dentry)->d_unionized)
4878 +#define IS_MNT_UNION(mnt) ((mnt)->mnt_flags & MNT_UNION)
4879 +
4880 +extern int is_unionized(struct dentry *, struct vfsmount *);
4881 +extern int append_to_union(struct vfsmount *, struct dentry *,
4882 + struct vfsmount *, struct dentry *);
4883 +extern int follow_union_down(struct path *);
4884 +extern int follow_union_mount(struct path *);
4885 +extern void __d_drop_unions(struct dentry *);
4886 +extern void shrink_d_unions(struct dentry *);
4887 +extern void __shrink_d_unions(struct dentry *, struct list_head *);
4888 +extern int attach_mnt_union(struct vfsmount *, struct vfsmount *,
4889 + struct dentry *);
4890 +extern void detach_mnt_union(struct vfsmount *);
4891 +extern struct dentry *union_create_topmost(struct nameidata *, struct qstr *,
4892 + struct path *);
4893 +extern int __union_copyup(struct path *, struct nameidata *, struct path *);
4894 +extern int union_copyup(struct nameidata *, int);
4895 +extern int union_copyup_dir(struct path *path);
4896 +extern int union_permission(struct path *, int);
4897 +
4898 +#else /* CONFIG_UNION_MOUNT */
4899 +
4900 +#define IS_UNION(x) (0)
4901 +#define IS_MNT_UNION(x) (0)
4902 +#define is_unionized(x, y) (0)
4903 +#define append_to_union(x1, y1, x2, y2) ({ BUG(); (0); })
4904 +#define follow_union_down(x) ({ (0); })
4905 +#define follow_union_mount(x) ({ (0); })
4906 +#define __d_drop_unions(x) do { } while (0)
4907 +#define shrink_d_unions(x) do { } while (0)
4908 +#define __shrink_d_unions(x,y) do { } while (0)
4909 +#define attach_mnt_union(x, y, z) ({ (0); }) /* yields int 0, like the real function on non-union mounts */
4910 +#define detach_mnt_union(x) do { } while (0)
4911 +#define union_create_topmost(x, y, z) ({ BUG(); (NULL); })
4912 +#define __union_copyup(x, y, z) ({ BUG(); (0); })
4913 +#define union_copyup(x, y) ({ (0); })
4914 +#define union_copyup_dir(x) ({ BUG(); (0); })
4915 +#define union_permission(x, y) inode_permission((x)->dentry->d_inode, (y)) /* args parenthesised so &nd->path works */
4916 +
4917 +#endif /* CONFIG_UNION_MOUNT */
4918 +#endif /* __KERNEL__ */
4919 +#endif /* __LINUX_UNION_H */
4920 --- a/mm/shmem.c
4921 +++ b/mm/shmem.c
4922 @@ -1798,6 +1798,118 @@ static int shmem_statfs(struct dentry *d
4923 return 0;
4924 }
4925
4926 +static int shmem_rmdir(struct inode *dir, struct dentry *dentry);
4927 +static int shmem_unlink(struct inode *dir, struct dentry *dentry);
4928 +
4929 +/*
4930 + * Create a dentry to signify a whiteout: @new_dentry is flagged DCACHE_WHITEOUT after whatever @old_dentry names is removed from @dir.
4931 + */
4932 +static int shmem_whiteout(struct inode *dir, struct dentry *old_dentry,
4933 + struct dentry *new_dentry)
4934 +{
4935 + struct shmem_sb_info *sbinfo = SHMEM_SB(dir->i_sb);
4936 + struct dentry *dentry;
4937 +
4938 + if (!(dir->i_sb->s_flags & MS_WHITEOUT)) /* superblock was not set up with MS_WHITEOUT */
4939 + return -EPERM;
4940 +
4941 + /* This gives us a properly initialized negative dentry */
4942 + dentry = simple_lookup(dir, new_dentry, NULL);
4943 + if (dentry && IS_ERR(dentry))
4944 + return PTR_ERR(dentry);
4945 +
4946 + /*
4947 + * No ordinary (disk based) filesystem counts whiteouts as inodes;
4948 + * but each new link needs a new dentry, pinning lowmem, and
4949 + * tmpfs dentries cannot be pruned until they are unlinked.
4950 + */
4951 + if (sbinfo->max_inodes) { /* accounting is skipped when max_inodes is 0 */
4952 + spin_lock(&sbinfo->stat_lock);
4953 + if (!sbinfo->free_inodes) {
4954 + spin_unlock(&sbinfo->stat_lock);
4955 + return -ENOSPC;
4956 + }
4957 + sbinfo->free_inodes--;
4958 + spin_unlock(&sbinfo->stat_lock);
4959 + }
4960 +
4961 + if (old_dentry->d_inode || d_is_fallthru(old_dentry)) { /* something (inode or fallthru) is being whited out */
4962 + if (old_dentry->d_inode && S_ISDIR(old_dentry->d_inode->i_mode))
4963 + shmem_rmdir(dir, old_dentry); /* NOTE(review): return value ignored, same for shmem_unlink() below */
4964 + else
4965 + shmem_unlink(dir, old_dentry);
4966 + }
4967 +
4968 + dir->i_size += BOGO_DIRENT_SIZE;
4969 + dir->i_ctime = dir->i_mtime = CURRENT_TIME;
4970 + /* Extra pinning count for the created dentry */
4971 + dget(new_dentry);
4972 + spin_lock(&new_dentry->d_lock); /* d_flags is updated under d_lock */
4973 + new_dentry->d_flags |= DCACHE_WHITEOUT;
4974 + spin_unlock(&new_dentry->d_lock);
4975 + return 0;
4976 +}
4977 +
4978 +static void shmem_d_instantiate(struct inode *dir, struct dentry *dentry,
4979 + struct inode *inode);
4980 +
4981 +/*
4982 + * Create a dentry to signify a fallthru. A fallthru lets us read the
4983 + * low-level dentries into the dcache once on the first readdir() and
4984 + * then (presumably) serve later readdir() calls from the dcache.
4985 + */
4986 +static int shmem_fallthru(struct inode *dir, struct dentry *dentry)
4987 +{
4988 + struct shmem_sb_info *sbinfo = SHMEM_SB(dir->i_sb);
4989 +
4990 + /* FIXME: this is stupid (fallthru is gated on MS_WHITEOUT) */
4991 + if (!(dir->i_sb->s_flags & MS_WHITEOUT))
4992 + return -EPERM;
4993 +
4994 + if (dentry->d_inode || d_is_fallthru(dentry) || d_is_whiteout(dentry))
4995 + return -EEXIST;
4996 +
4997 + /*
4998 + * Each new link needs a new dentry, pinning lowmem, and tmpfs
4999 + * dentries cannot be pruned until they are unlinked.
5000 + */
5001 + if (sbinfo->max_inodes) {
5002 + spin_lock(&sbinfo->stat_lock);
5003 + if (!sbinfo->free_inodes) {
5004 + spin_unlock(&sbinfo->stat_lock);
5005 + return -ENOSPC;
5006 + }
5007 + sbinfo->free_inodes--;
5008 + spin_unlock(&sbinfo->stat_lock);
5009 + }
5010 +
5011 + shmem_d_instantiate(dir, dentry, NULL);
5012 + dir->i_ctime = dir->i_mtime = CURRENT_TIME;
5013 +
5014 + spin_lock(&dentry->d_lock);
5015 + dentry->d_flags |= DCACHE_FALLTHRU;
5016 + spin_unlock(&dentry->d_lock);
5017 + return 0;
5018 +}
5019 +
5020 +static void shmem_d_instantiate(struct inode *dir, struct dentry *dentry,
5021 + struct inode *inode)
5022 +{
5023 + if (d_is_whiteout(dentry)) {
5024 + /* Re-using an existing whiteout */
5025 + shmem_free_inode(dir->i_sb);
5026 + if (S_ISDIR(inode->i_mode))
5027 + inode->i_mode |= S_OPAQUE;
5028 + } else if (d_is_fallthru(dentry)) {
5029 + shmem_free_inode(dir->i_sb);
5030 + } else {
5031 + /* New dentry */
5032 + dir->i_size += BOGO_DIRENT_SIZE;
5033 + dget(dentry); /* Extra count - pin the dentry in core */
5034 + }
5035 + /* Will clear DCACHE_WHITEOUT and DCACHE_FALLTHRU flags */
5036 + d_instantiate(dentry, inode);
5037 +}
5038 /*
5039 * File creation. Allocate an inode, and we're done..
5040 */
5041 @@ -1822,15 +1934,16 @@ shmem_mknod(struct inode *dir, struct de
5042 iput(inode);
5043 return error;
5044 }
5045 +
5046 if (dir->i_mode & S_ISGID) {
5047 inode->i_gid = dir->i_gid;
5048 if (S_ISDIR(mode))
5049 inode->i_mode |= S_ISGID;
5050 }
5051 - dir->i_size += BOGO_DIRENT_SIZE;
5052 +
5053 + shmem_d_instantiate(dir, dentry, inode);
5054 +
5055 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
5056 - d_instantiate(dentry, inode);
5057 - dget(dentry); /* Extra count - pin the dentry in core */
5058 }
5059 return error;
5060 }
5061 @@ -1868,12 +1981,11 @@ static int shmem_link(struct dentry *old
5062 if (ret)
5063 goto out;
5064
5065 - dir->i_size += BOGO_DIRENT_SIZE;
5066 + shmem_d_instantiate(dir, dentry, inode);
5067 +
5068 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
5069 inc_nlink(inode);
5070 atomic_inc(&inode->i_count); /* New dentry reference */
5071 - dget(dentry); /* Extra pinning count for the created dentry */
5072 - d_instantiate(dentry, inode);
5073 out:
5074 return ret;
5075 }
5076 @@ -1882,21 +1994,63 @@ static int shmem_unlink(struct inode *di
5077 {
5078 struct inode *inode = dentry->d_inode;
5079
5080 - if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
5081 - shmem_free_inode(inode->i_sb);
5082 + if (d_is_whiteout(dentry) || d_is_fallthru(dentry) ||
5083 + (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)))
5084 + shmem_free_inode(dir->i_sb);
5085
5086 + if (inode) {
5087 + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
5088 + drop_nlink(inode);
5089 + }
5090 dir->i_size -= BOGO_DIRENT_SIZE;
5091 - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
5092 - drop_nlink(inode);
5093 dput(dentry); /* Undo the count from "create" - this does all the work */
5094 return 0;
5095 }
5096
5097 +static void shmem_dir_unlink_whiteouts(struct inode *dir, struct dentry *dentry)
5098 +{
5099 + if (!dentry->d_inode)
5100 + return;
5101 +
5102 + /* Remove whiteouts from a logically empty directory */
5103 + if (S_ISDIR(dentry->d_inode->i_mode) &&
5104 + dentry->d_inode->i_sb->s_flags & MS_WHITEOUT) {
5105 + struct dentry *child, *next;
5106 + LIST_HEAD(list);
5107 +
5108 + spin_lock(&dcache_lock);
5109 + list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
5110 + spin_lock(&child->d_lock);
5111 + /* Unlink fallthrus too */
5112 + if (d_is_whiteout(child) || d_is_fallthru(child)) {
5113 + __d_drop(child);
5114 + if (!list_empty(&child->d_lru)) {
5115 + list_del(&child->d_lru);
5116 + dentry_stat.nr_unused--;
5117 + }
5118 + list_add(&child->d_lru, &list);
5119 + }
5120 + spin_unlock(&child->d_lock);
5121 + }
5122 + spin_unlock(&dcache_lock);
5123 +
5124 + list_for_each_entry_safe(child, next, &list, d_lru) {
5125 + spin_lock(&child->d_lock);
5126 + list_del_init(&child->d_lru);
5127 + spin_unlock(&child->d_lock);
5128 +
5129 + shmem_unlink(dentry->d_inode, child);
5130 + }
5131 + }
5132 +}
5133 +
5134 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
5135 {
5136 if (!simple_empty(dentry))
5137 return -ENOTEMPTY;
5138
5139 + /* Remove whiteouts from a logically empty directory */
5140 + shmem_dir_unlink_whiteouts(dir, dentry);
5141 drop_nlink(dentry->d_inode);
5142 drop_nlink(dir);
5143 return shmem_unlink(dir, dentry);
5144 @@ -1905,7 +2059,7 @@ static int shmem_rmdir(struct inode *dir
5145 /*
5146 * The VFS layer already does all the dentry stuff for rename,
5147 * we just have to decrement the usage count for the target if
5148 - * it exists so that the VFS layer correctly free's it when it
5149 + * it exists so that the VFS layer correctly frees it when it
5150 * gets overwritten.
5151 */
5152 static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
5153 @@ -1916,7 +2070,12 @@ static int shmem_rename(struct inode *ol
5154 if (!simple_empty(new_dentry))
5155 return -ENOTEMPTY;
5156
5157 + if (d_is_whiteout(new_dentry))
5158 + shmem_unlink(new_dir, new_dentry);
5159 +
5160 if (new_dentry->d_inode) {
5161 + /* Remove whiteouts from a logically empty directory */
5162 + shmem_dir_unlink_whiteouts(new_dir, new_dentry);
5163 (void) shmem_unlink(new_dir, new_dentry);
5164 if (they_are_dirs)
5165 drop_nlink(old_dir);
5166 @@ -1981,12 +2140,12 @@ static int shmem_symlink(struct inode *d
5167 unlock_page(page);
5168 page_cache_release(page);
5169 }
5170 +
5171 + shmem_d_instantiate(dir, dentry, inode);
5172 +
5173 if (dir->i_mode & S_ISGID)
5174 inode->i_gid = dir->i_gid;
5175 - dir->i_size += BOGO_DIRENT_SIZE;
5176 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
5177 - d_instantiate(dentry, inode);
5178 - dget(dentry);
5179 return 0;
5180 }
5181
5182 @@ -2363,6 +2522,12 @@ int shmem_fill_super(struct super_block
5183 if (!root)
5184 goto failed_iput;
5185 sb->s_root = root;
5186 +
5187 +#ifdef CONFIG_TMPFS
5188 + if (!(sb->s_flags & MS_NOUSER))
5189 + sb->s_flags |= MS_WHITEOUT;
5190 +#endif
5191 +
5192 return 0;
5193
5194 failed_iput:
5195 @@ -2463,6 +2628,8 @@ static const struct inode_operations shm
5196 .rmdir = shmem_rmdir,
5197 .mknod = shmem_mknod,
5198 .rename = shmem_rename,
5199 + .whiteout = shmem_whiteout,
5200 + .fallthru = shmem_fallthru,
5201 #endif
5202 #ifdef CONFIG_TMPFS_POSIX_ACL
5203 .setattr = shmem_notify_change,
This page took 0.321557 seconds and 5 git commands to generate.