0%

process_vm_readv 源码分析

调用链

process_vm_rw->process_vm_rw_core->process_vm_rw_single_vec->process_vm_rw_pages->copy_page_to_iter->
copy_page_to_iter_iovec->copyout

核心代码

common/mm/process_vm_access.c
common/lib/iov_iter.c

版本号:android-4.14-stable(注:下文 iov_iter.c 部分引用的代码来自更新的内核版本——如 kmap_local_page、fault_in_writeable 等接口在 4.14 中并不存在,阅读时请留意版本差异)

Linux manual page

process_vm_readv(2) - Linux manual page

1
2
3
4
5
6
7
8
9
10
11
12
13
14
#include <sys/uio.h>

ssize_t process_vm_readv(pid_t pid,
const struct iovec *local_iov,
unsigned long liovcnt,
const struct iovec *remote_iov,
unsigned long riovcnt,
unsigned long flags);
ssize_t process_vm_writev(pid_t pid,
const struct iovec *local_iov,
unsigned long liovcnt,
const struct iovec *remote_iov,
unsigned long riovcnt,
unsigned long flags);

定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Syscall entry for process_vm_readv: copy data FROM the process
 * identified by @pid into this process's local iovecs.
 * The final argument (vm_write = 0) selects the read direction
 * inside the shared process_vm_rw() helper.
 */
SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
unsigned long, liovcnt, const struct iovec __user *, rvec,
unsigned long, riovcnt, unsigned long, flags)
{
return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0);
}

/*
 * Syscall entry for process_vm_writev: copy data from this process's
 * local iovecs INTO the process identified by @pid.
 * The final argument (vm_write = 1) selects the write direction
 * inside the shared process_vm_rw() helper.
 */
SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
const struct iovec __user *, lvec,
unsigned long, liovcnt, const struct iovec __user *, rvec,
unsigned long, riovcnt, unsigned long, flags)
{
return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
}

process_vm_access.c

common/mm/process_vm_access.c

process_vm_rw

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
/*
 * process_vm_rw - common front end for process_vm_readv/writev.
 * @vm_write: 0 = read from the remote process, 1 = write to it.
 *
 * Validates @flags, imports the local iovec array into an iov_iter,
 * sanity-checks the remote iovec array (no data copied yet), then
 * delegates the actual transfer to process_vm_rw_core().
 * Returns bytes transferred or a negative errno.
 */
static ssize_t process_vm_rw(pid_t pid,
const struct iovec __user *lvec,
unsigned long liovcnt,
const struct iovec __user *rvec,
unsigned long riovcnt,
unsigned long flags, int vm_write)
{
/* Small on-stack iovec arrays; the heap is used only when the caller
 * passes more than UIO_FASTIOV entries. */
struct iovec iovstack_l[UIO_FASTIOV];
struct iovec iovstack_r[UIO_FASTIOV];
struct iovec *iov_l = iovstack_l;
struct iovec *iov_r = iovstack_r;
struct iov_iter iter;
ssize_t rc;
int dir = vm_write ? WRITE : READ;

/* flags is reserved and must currently be zero. */
if (flags != 0)
return -EINVAL;

/* Check iovecs */
rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
if (rc < 0)
return rc;
/* Zero local bytes requested: nothing to do, rc stays 0. */
if (!iov_iter_count(&iter))
goto free_iovecs;

/* CHECK_IOVEC_ONLY: validate the remote iovecs without copying data. */
rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
iovstack_r, &iov_r);
if (rc <= 0)
goto free_iovecs;

rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);

free_iovecs:
if (iov_r != iovstack_r)
kfree(iov_r);
/* import_iovec() leaves iov_l NULL when the stack array sufficed and
 * kfree(NULL) is a no-op, so an unconditional kfree is safe here. */
kfree(iov_l);

return rc;
}

检查iovecs后,调用process_vm_rw_core

process_vm_rw_core

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/* Maximum number of entries for process pages array
which lives on stack */
#define PVM_MAX_PP_ARRAY_COUNT 16

/**
* process_vm_rw_core - core of reading/writing pages from task specified
* @pid: PID of process to read/write from/to
* @iter: where to copy to/from locally
* @rvec: iovec array specifying where to copy to/from in the other process
* @riovcnt: size of rvec array
* @flags: currently unused
* @vm_write: 0 if reading from other process, 1 if writing to other process
* Returns the number of bytes read/written or error code. May
* return less bytes than expected if an error occurs during the copying
* process.
*/
static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
const struct iovec *rvec,
unsigned long riovcnt,
unsigned long flags, int vm_write)
{
struct task_struct *task;
struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT];
struct page **process_pages = pp_stack;
struct mm_struct *mm;
unsigned long i;
ssize_t rc = 0;
unsigned long nr_pages = 0;
unsigned long nr_pages_iov;
ssize_t iov_len;
size_t total_len = iov_iter_count(iter);

/*
* Work out how many pages of struct pages we're going to need
* when eventually calling get_user_pages
*/
/* nr_pages is the MAX over all remote iovecs, not the sum: the same
* pages array is reused for each iovec in turn. */
for (i = 0; i < riovcnt; i++) {
iov_len = rvec[i].iov_len;
if (iov_len > 0) {
nr_pages_iov = ((unsigned long)rvec[i].iov_base
+ iov_len)
/ PAGE_SIZE - (unsigned long)rvec[i].iov_base
/ PAGE_SIZE + 1;
nr_pages = max(nr_pages, nr_pages_iov);
}
}

if (nr_pages == 0)
return 0;

if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) {
/* For reliability don't try to kmalloc more than
2 pages worth */
/* NOTE(review): "struct pages *" below is a long-standing upstream
* typo for "struct page *"; harmless because every pointer has the
* same size, but worth knowing when reading this expression. */
process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES,
sizeof(struct pages *)*nr_pages),
GFP_KERNEL);

if (!process_pages)
return -ENOMEM;
}

/* Get process information */
/* Look up the target by pid in the caller's namespace and take a
* reference so the task_struct stays valid after rcu_read_unlock(). */
rcu_read_lock();
task = find_task_by_vpid(pid);
if (task)
get_task_struct(task);
rcu_read_unlock();
if (!task) {
rc = -ESRCH;
goto free_proc_pages;
}

/* mm_access() does the ptrace-style permission check and pins the
* target mm; it can return NULL (task has no mm, e.g. a kernel
* thread) or an ERR_PTR on permission failure. */
mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
if (!mm || IS_ERR(mm)) {
rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
/*
* Explicitly map EACCES to EPERM as EPERM is a more
* appropriate error code for process_vm_readv/writev
*/
if (rc == -EACCES)
rc = -EPERM;
goto put_task_struct;
}

/* Walk the remote iovecs until the local iter is drained or an error
* occurs (a non-zero rc stops the loop). */
for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
rc = process_vm_rw_single_vec(
(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
iter, process_pages, mm, task, vm_write);

/* copied = space before - space after */
total_len -= iov_iter_count(iter);

/* If we have managed to copy any data at all then
we return the number of bytes copied. Otherwise
we return the error code */
if (total_len)
rc = total_len;

mmput(mm);

put_task_struct:
put_task_struct(task);

free_proc_pages:
if (process_pages != pp_stack)
kfree(process_pages);
return rc;
}
  1. 计算nr_pages后,与PVM_MAX_PP_ARRAY_COUNT比较,大于则调用kmalloc申请内存,否则直接使用process_pages
  2. 调用find_task_by_vpid和get_task_struct,获取进程的task_struct
  3. 调用mm_access(task, PTRACE_MODE_ATTACH_REALCREDS)获取mm_struct(同时进行权限检查)。
  4. 遍历rvec,调用process_vm_rw_single_vec

    process_vm_rw_single_vec

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    /* Maximum number of pages kmalloc'd to hold struct page's during copy */
    #define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2)

    /**
    * process_vm_rw_single_vec - read/write pages from task specified
    * @addr: start memory address of target process
    * @len: size of area to copy to/from
    * @iter: where to copy to/from locally
    * @process_pages: struct pages area that can store at least
    * nr_pages_to_copy struct page pointers
    * @mm: mm for task
    * @task: task to read/write from
    * @vm_write: 0 means copy from, 1 means copy to
    * Returns 0 on success or on failure error code
    */
    static int process_vm_rw_single_vec(unsigned long addr,
    unsigned long len,
    struct iov_iter *iter,
    struct page **process_pages,
    struct mm_struct *mm,
    struct task_struct *task,
    int vm_write)
    {
    /* pa: page-aligned start; start_offset: byte offset within first page. */
    unsigned long pa = addr & PAGE_MASK;
    unsigned long start_offset = addr - pa;
    unsigned long nr_pages;
    ssize_t rc = 0;
    /* Cap pages pinned per iteration so process_pages never exceeds the
     * PVM_MAX_KMALLOC_PAGES-sized allocation made by the caller. */
    unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
    / sizeof(struct pages *);
    unsigned int flags = 0;

    /* Work out address and page range required */
    if (len == 0)
    return 0;
    nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;

    /* Writing to the remote process requires FOLL_WRITE-pinned pages. */
    if (vm_write)
    flags |= FOLL_WRITE;

    while (!rc && nr_pages && iov_iter_count(iter)) {
    int pages = min(nr_pages, max_pages_per_loop);
    int locked = 1;
    size_t bytes;

    /*
    * Get the pages we're interested in. We must
    * access remotely because task/mm might not be
    * current/current->mm
    */
    down_read(&mm->mmap_sem);
    pages = get_user_pages_remote(task, mm, pa, pages, flags,
    process_pages, NULL, &locked);
    /* gup may drop mmap_sem itself; only unlock if still held. */
    if (locked)
    up_read(&mm->mmap_sem);
    if (pages <= 0)
    return -EFAULT;

    /* Bytes covered by the pinned pages, minus the initial offset,
     * clamped to the remaining request length. */
    bytes = pages * PAGE_SIZE - start_offset;
    if (bytes > len)
    bytes = len;

    rc = process_vm_rw_pages(process_pages,
    start_offset, bytes, iter,
    vm_write);
    len -= bytes;
    /* Only the first chunk starts mid-page. */
    start_offset = 0;
    nr_pages -= pages;
    pa += pages * PAGE_SIZE;
    /* Drop the gup references taken above. */
    while (pages)
    put_page(process_pages[--pages]);
    }

    return rc;
    }
    通过get_user_pages_remote获取pages(调用链为__get_user_pages_locked->__get_user_pages),然后通过process_vm_rw_pages读写物理内存。

__get_user_pages

1
2
3
4
static long __get_user_pages(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)

process_vm_rw_pages

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/**
* process_vm_rw_pages - read/write pages from task specified
* @pages: array of pointers to pages we want to copy
* @start_offset: offset in page to start copying from/to
* @len: number of bytes to copy
* @iter: where to copy to/from locally
* @vm_write: 0 means copy from, 1 means copy to
* Returns 0 on success, error code otherwise
*/
static int process_vm_rw_pages(struct page **pages,
unsigned offset,
size_t len,
struct iov_iter *iter,
int vm_write)
{
/* Do the copy for each page */
while (len && iov_iter_count(iter)) {
struct page *page = *pages++;
size_t copy = PAGE_SIZE - offset;
size_t copied;

if (copy > len)
copy = len;

if (vm_write) {
copied = copy_page_from_iter(page, offset, copy, iter);
set_page_dirty_lock(page);
} else {
copied = copy_page_to_iter(page, offset, copy, iter);
}
len -= copied;
if (copied < copy && iov_iter_count(iter))
return -EFAULT;
offset = 0;
}
return 0;
}

copy_page_to_iter

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/*
 * __copy_page_to_iter - copy up to @bytes from @page+@offset into @i,
 * dispatching on the iterator flavor. Returns the number of bytes
 * actually copied (may be short on a user-space fault).
 */
static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
/* Userspace iovec destination: the common process_vm_readv path. */
if (likely(iter_is_iovec(i)))
return copy_page_to_iter_iovec(page, offset, bytes, i);
/* Kernel-addressable destinations: map the page and memcpy. */
if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
void *kaddr = kmap_local_page(page);
size_t wanted = _copy_to_iter(kaddr + offset, bytes, i);
kunmap_local(kaddr);
return wanted;
}
if (iov_iter_is_pipe(i))
return copy_page_to_iter_pipe(page, offset, bytes, i);
/* Discard iterator: consume the count without copying anything. */
if (unlikely(iov_iter_is_discard(i))) {
if (unlikely(i->count < bytes))
bytes = i->count;
i->count -= bytes;
return bytes;
}
/* Unknown iterator type: should be unreachable. */
WARN_ON(1);
return 0;
}

/*
 * copy_page_to_iter - public wrapper around __copy_page_to_iter().
 * Validates the (page, offset, bytes) range, then walks one subpage at
 * a time so that @offset/@bytes may span multiple pages of a compound
 * page. Returns total bytes copied; stops early on a short copy.
 */
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
size_t res = 0;
if (unlikely(!page_copy_sane(page, offset, bytes)))
return 0;
page += offset / PAGE_SIZE; // first subpage
offset %= PAGE_SIZE;
while (1) {
/* Copy at most to the end of the current subpage. */
size_t n = __copy_page_to_iter(page, offset,
min(bytes, (size_t)PAGE_SIZE - offset), i);
res += n;
bytes -= n;
/* Done, or the copy came up short (fault / iter exhausted). */
if (!bytes || !n)
break;
offset += n;
if (offset == PAGE_SIZE) {
page++;
offset = 0;
}
}
return res;
}

iov_iter.c

common/lib/iov_iter.c

copy_page_to_iter_iovec

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/*
 * copy_page_to_iter_iovec - copy @bytes from @page+@offset out to the
 * userspace iovec segments of @i.
 *
 * Fast path: pre-fault the destination, then copy under kmap_atomic
 * (no sleeping allowed). If that cannot finish, fall back to a plain
 * kmap and copy the remainder with faults permitted. Finally advance
 * the iterator state (iov, iov_offset, count, nr_segs).
 * Returns the number of bytes actually copied.
 */
static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
size_t skip, copy, left, wanted;
const struct iovec *iov;
char __user *buf;
void *kaddr, *from;

/* Never copy more than the iterator has room for. */
if (unlikely(bytes > i->count))
bytes = i->count;

if (unlikely(!bytes))
return 0;

might_fault();
wanted = bytes;
iov = i->iov;
skip = i->iov_offset;
buf = iov->iov_base + skip;
/* First chunk is bounded by what remains of the current segment. */
copy = min(bytes, iov->iov_len - skip);

/* Atomic fast path: only worth attempting when the destination pages
 * are already faulted in, since copyout() cannot sleep here.
 * fault_in_writeable() returns 0 on success. */
if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
kaddr = kmap_atomic(page);
from = kaddr + offset;

/* first chunk, usually the only one */
/* copyout() returns bytes NOT copied; left == 0 means full copy. */
left = copyout(buf, from, copy);
copy -= left;
skip += copy;
from += copy;
bytes -= copy;

/* Spill into subsequent iovec segments while copies stay whole. */
while (unlikely(!left && bytes)) {
iov++;
buf = iov->iov_base;
copy = min(bytes, iov->iov_len);
left = copyout(buf, from, copy);
copy -= left;
skip = copy;
from += copy;
bytes -= copy;
}
if (likely(!bytes)) {
kunmap_atomic(kaddr);
goto done;
}
/* Partial copy under kmap_atomic: record progress and retry the
 * rest below with a sleeping kmap. */
offset = from - kaddr;
buf += copy;
kunmap_atomic(kaddr);
copy = min(bytes, iov->iov_len - skip);
}
/* Too bad - revert to non-atomic kmap */

kaddr = kmap(page);
from = kaddr + offset;
left = copyout(buf, from, copy);
copy -= left;
skip += copy;
from += copy;
bytes -= copy;
while (unlikely(!left && bytes)) {
iov++;
buf = iov->iov_base;
copy = min(bytes, iov->iov_len);
left = copyout(buf, from, copy);
copy -= left;
skip = copy;
from += copy;
bytes -= copy;
}
kunmap(page);

done:
/* Exactly finished a segment: step to the next one. */
if (skip == iov->iov_len) {
iov++;
skip = 0;
}
/* Advance the iterator by the bytes actually copied. */
i->count -= wanted - bytes;
i->nr_segs -= iov - i->iov;
i->iov = iov;
i->iov_offset = skip;
return wanted - bytes;
}

copyout

1
2
3
4
5
6
7
8
9
10
/*
 * copyout - copy @n bytes from kernel buffer @from to user pointer @to.
 * Returns the number of bytes NOT copied (0 on complete success),
 * matching raw_copy_to_user() semantics.
 */
static int copyout(void __user *to, const void *from, size_t n)
{
/* Fault-injection hook: pretend the whole copy failed. */
if (should_fail_usercopy())
return n;
/* Only touch userspace after validating the destination range. */
if (access_ok(to, n)) {
instrument_copy_to_user(to, from, n);
n = raw_copy_to_user(to, from, n);
}
return n;
}