The Process Address Space

Introduction

Besides managing its own physical memory, the kernel also has to manage the memory of user-space processes. This memory is called the process address space. Because Linux uses virtual memory, every process is under the illusion that it has the whole of memory to itself.

The Address Space

A process's address space consists of the virtual memory it can address: an independent, contiguous (flat) address space of 32 or 64 bits, depending on the architecture. The same memory address can appear in two different processes' address spaces and yet refer to completely unrelated data; processes that instead share a single address space are called threads.

Although a process can address up to 2^32 or 2^64 bytes of virtual memory, that does not mean it has permission to access every virtual address. The intervals of legal addresses a process may access are called memory areas, and a process can dynamically add and remove memory areas from its address space.

If a process accesses a memory address outside its valid memory areas, the kernel terminates the process with a "Segmentation Fault".

A process's memory areas can contain the following memory objects:

  • A memory map of the executable file’s code, called the text section.
  • A memory map of the executable file’s initialized global variables, called the data section.
  • A memory map of the zero page (a page consisting of all zeros, used for purposes such as this) containing uninitialized global variables, called the bss section.
  • A memory map of the zero page used for the process’s user-space stack. (Do not confuse this with the process’s kernel stack, which is separate and maintained and used by the kernel.)
  • An additional text, data, and bss section for each shared library, such as the C library and dynamic linker, loaded into the process’s address space.
  • Any memory mapped files.
  • Any shared memory segments.
  • Any anonymous memory mappings, such as those associated with malloc().

The Memory Descriptor: mm_struct

The mm_struct structure holds all of the information about a process's address space; normally each process has its own unique mm_struct. All mm_struct structures are strung together on the mmlist doubly linked list. The initial element of the list is the init_mm memory descriptor, which describes the address space of the init process, and the list must be manipulated under the mmlist_lock to prevent concurrent access.

mm_struct source code

struct mm_struct {

//head of the linked list of memory area (VMA) objects
struct vm_area_struct * mmap; /* list of VMAs */
//red-black tree holding the same VMA objects
struct rb_root mm_rb;
//the most recently found memory area
struct vm_area_struct * mmap_cache; /* last find_vma result */

//searches the process address space for a free linear address range
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);

unsigned long (*get_unmapped_exec_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);

//called when a memory area is unmapped
void (*unmap_area) (struct mm_struct *mm, unsigned long addr);

//base linear address for file memory mappings
unsigned long mmap_base; /* base of mmap area */


unsigned long task_size; /* size of task vm space */
/*
* RHEL6 special for bug 790921: this same variable can mean
* two different things. If sysctl_unmap_area_factor is zero,
* this means the largest hole below free_area_cache. If the
* sysctl is set to a positive value, this variable is used
* to count how much memory has been munmapped from this process
* since the last time free_area_cache was reset back to mmap_base.
* This is ugly, but necessary to preserve kABI.
*/
unsigned long cached_hole_size;

//address from which the kernel resumes searching for a free linear address range
unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */

//points to the page global directory
pgd_t * pgd;

//number of processes sharing this address space
atomic_t mm_users; /* How many users with user space? */

//primary reference count of the descriptor; all mm_users together hold one reference, and 0 means no users remain
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */

//number of memory areas (VMAs)
int map_count; /* number of VMAs */

struct rw_semaphore mmap_sem;

//lock protecting the task's page tables and some counters
spinlock_t page_table_lock; /* Protects page tables and some counters */

//link in the global list of mm_structs, which is headed by init_mm
struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
* together off init_mm.mmlist, and are protected
* by mmlist_lock
*/

/* Special counters, in some configurations protected by the
* page_table_lock, in other configurations by being atomic.
*/

mm_counter_t _file_rss;
mm_counter_t _anon_rss;
mm_counter_t _swap_usage;

//high-watermark of RSS usage, in pages (not page tables)
unsigned long hiwater_rss; /* High-watermark of RSS usage */
//high-watermark of virtual memory usage, in pages
unsigned long hiwater_vm; /* High-water virtual memory usage */

//total mapped pages, pages locked against paging, pages in shared file mappings, pages in executable mappings
unsigned long total_vm, locked_vm, shared_vm, exec_vm;
//pages in the user-space stack, reserved pages, default VMA flags, page table pages
unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
//start and end of the code section, start and end of the data section
unsigned long start_code, end_code, start_data, end_data;
//start of the heap, current end of the heap (brk), start of the stack
unsigned long start_brk, brk, start_stack;
//start and end addresses of the command-line arguments and of the environment variables
unsigned long arg_start, arg_end, env_start, env_end;

unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

struct linux_binfmt *binfmt;

cpumask_t cpu_vm_mask;

/* Architecture-specific MM context */
mm_context_t context;

/* Swap token stuff */
/*
* Last value of global fault stamp as seen by this process.
* In other words, this value gives an indication of how long
* it has been since this task got the token.
* Look at mm/thrash.c
*/
unsigned int faultstamp;
unsigned int token_priority;
unsigned int last_interval;

//address space status flags
unsigned long flags; /* Must use atomic bitops to access the bits */

struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
struct hlist_head ioctx_list;
#endif
#ifdef CONFIG_MM_OWNER
/*
* "owner" points to a task that is regarded as the canonical
* user/owner of this mm. All of the following must be true in
* order for it to be changed:
*
* current == mm->owner
* current->mm != mm
* new_owner->mm == mm
* new_owner->alloc_lock is held
*/
struct task_struct *owner;
#endif

#ifdef CONFIG_PROC_FS
/* store ref to file /proc/<pid>/exe symlink points to */
struct file *exe_file;
unsigned long num_exe_file_vmas;
#endif
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_mm *mmu_notifier_mm;
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
/* reserved for Red Hat */
#ifdef __GENKSYMS__
unsigned long rh_reserved[2];
#else
/* How many tasks sharing this mm are OOM_DISABLE */
union {
unsigned long rh_reserved_aux;
atomic_t oom_disable_count;
};

/* base of lib map area (ASCII armour) */
unsigned long shlib_base;
#endif
};

Allocating a Memory Descriptor

The mm field of a process's task_struct holds that process's memory descriptor. During fork(), copy_mm() duplicates the parent's memory descriptor; the child's mm_struct is actually allocated from the mm_cachep slab cache by the allocate_mm() function in kernel/fork.c.

If the parent wants to share its address space with the child, it sets the CLONE_VM flag when calling clone(). Such processes are what we call threads; whether or not the address space is shared is practically the only difference between processes and threads.

When CLONE_VM is specified, allocate_mm() is not called:

if (clone_flags & CLONE_VM) {
        /*
         * current is the parent process and
         * tsk is the child process during a fork()
         */
        atomic_inc(&current->mm->mm_users);
        tsk->mm = current->mm;
}

Destroying a Memory Descriptor

When a process exits, exit_mm() in kernel/exit.c is called. It calls mmput(), which decrements the mm_users count; when that reaches zero, mmdrop() decrements the mm_count count. When mm_count also reaches zero, no one is using the descriptor any longer, and the free_mm() macro returns the mm_struct to the mm_cachep slab cache via kmem_cache_free().
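
The two-stage reference counting described above can be condensed into the following sketch (simplified from the real mmput()/mmdrop() in kernel/fork.c, which also handle AIO contexts and other teardown work):

/* Simplified sketch of the two-stage teardown (after kernel/fork.c). */
void mmput(struct mm_struct *mm)
{
        if (atomic_dec_and_test(&mm->mm_users)) { /* last user gone */
                exit_mmap(mm);                    /* tear down all VMAs */
                mmdrop(mm);                       /* drop the reference held by mm_users */
        }
}

static inline void mmdrop(struct mm_struct *mm)
{
        if (atomic_dec_and_test(&mm->mm_count))   /* last reference gone */
                __mmdrop(mm);                     /* frees the pgd, then free_mm(mm) */
}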

Kernel Threads

Kernel threads have no process address space and therefore no mm_struct of their own; a kernel thread's mm field is NULL. This is the very definition of a kernel thread: it has no user context.
Instead, a kernel thread uses the memory descriptor of whatever task ran before it. When the kernel sees that mm is NULL, it keeps the previous process's address space loaded. Because kernel threads never touch user-space memory, they make use only of the kernel-related parts of the address space, which are identical for every process.
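
A minimal sketch of how the scheduler implements this borrowing, condensed from the context_switch() logic in kernel/sched.c (prev is the outgoing task, next the incoming one):

struct mm_struct *mm = next->mm;
struct mm_struct *oldmm = prev->active_mm;

if (!mm) {                               /* next is a kernel thread */
        next->active_mm = oldmm;         /* borrow the previous address space */
        atomic_inc(&oldmm->mm_count);    /* pin the borrowed descriptor */
        enter_lazy_tlb(oldmm, next);     /* user pages will never be touched */
} else
        switch_mm(oldmm, mm, next);      /* ordinary process: switch page tables */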

Virtual Memory Areas (VMAs)

A VMA is represented by the vm_area_struct structure, which describes a single independent, contiguous interval of memory within a given address space. Its vm_mm field points back to the owning mm_struct, and each VMA is unique to the mm_struct it points to.

Each VMA is managed as a distinct memory object with a consistent set of properties, and each can represent a different type of memory area, such as a memory-mapped file or the process's user-space stack.

struct vm_area_struct {
        /* The first cache line has the info for VMA tree walking. */

        unsigned long vm_start;         /* Our start address within vm_mm. */
        unsigned long vm_end;           /* The first byte after our end address within vm_mm. */

        /* linked list of VM areas per task, sorted by address */
        struct vm_area_struct *vm_next, *vm_prev;

        struct rb_node vm_rb;
        /*
         * Largest free memory gap in bytes to the left of this VMA.
         * Either between this VMA and vma->vm_prev, or between one of the
         * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
         * get_unmapped_area find a free area of the right size.
         */
        unsigned long rb_subtree_gap;

        /* Second cache line starts here. */

        struct mm_struct *vm_mm;        /* The address space we belong to. */
        pgprot_t vm_page_prot;          /* Access permissions of this VMA. */
        unsigned long vm_flags;         /* Flags, see mm.h. */

        /*
         * For areas with an address space and backing store,
         * linkage into the address_space->i_mmap interval tree, or
         * linkage of vma in the address_space->i_mmap_nonlinear list.
         */
        union {
                struct {
                        struct rb_node rb;
                        unsigned long rb_subtree_last;
                } linear;
                struct list_head nonlinear;
        } shared;

        /*
         * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
         * list, after a COW of one of the file pages. A MAP_SHARED vma
         * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
         * or brk vma (with NULL file) can only be in an anon_vma list.
         */
        struct list_head anon_vma_chain; /* Serialized by mmap_sem &
                                          * page_table_lock */
        struct anon_vma *anon_vma;      /* Serialized by page_table_lock */

        /* Function pointers to deal with this struct. */
        const struct vm_operations_struct *vm_ops;

        /* Information about our backing store: */
        unsigned long vm_pgoff;         /* Offset (within vm_file) in PAGE_SIZE units, *not* PAGE_CACHE_SIZE */
        struct file * vm_file;          /* File we map to (can be NULL). */
        void * vm_private_data;         /* was vm_pte (shared mem) */
};

VMA Operations

The vm_ops field in the structure above points to the VMA's table of operations; as with the VFS, each type of VMA can provide its own implementation of these methods.


struct vm_operations_struct {
        void (*open) (struct vm_area_struct *); //the given memory area is added to an address space
        void (*close) (struct vm_area_struct *); //the given memory area is removed from an address space
        int (*fault) (struct vm_area_struct *, struct vm_fault *); //invoked by the page fault handler when a page not present in physical memory is accessed
        int (*page_mkwrite) (struct vm_area_struct *vma, struct vm_fault *vmf); //invoked when a read-only page is about to become writable
        int (*access) (struct vm_area_struct *, unsigned long, void *, int, int); //invoked by access_process_vm() when get_user_pages() fails
};
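
As a sketch of how a driver might fill in these hooks, the fault() method below hands out pages of a preallocated kernel buffer; my_buffer and MY_BUF_SIZE are hypothetical names (assume my_buffer was allocated with kmalloc()), not part of any real API:

/* Hypothetical fault() implementation backing a VMA with a kernel buffer. */
static int my_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        unsigned long offset = vmf->pgoff << PAGE_SHIFT; /* fault offset into the mapping */
        struct page *page;

        if (offset >= MY_BUF_SIZE)
                return VM_FAULT_SIGBUS;          /* fault beyond the buffer */

        page = virt_to_page(my_buffer + offset); /* page backing this offset */
        get_page(page);                          /* take a reference for the mapping */
        vmf->page = page;                        /* the handler returns the page here */
        return 0;
}

static const struct vm_operations_struct my_vm_ops = {
        .fault = my_vma_fault,
};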

The VMA Tree and List

Memory areas are reachable through two fields of mm_struct: mmap and mm_rb. The mmap field stores the VMAs on a linked list sorted by ascending address and points to the first memory area; it is used mainly for traversing all areas, as in the sketch below. The mm_rb field points to the root of a red-black tree holding the same VMAs; the tree is used to locate a specific memory area quickly. Both structures contain exactly the same vm_area_struct objects, merely organized in different data structures.
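
Visiting every memory area of a process is a straightforward traversal of the mmap list (a sketch; mm must be a valid memory descriptor and mmap_sem must be held across the walk):

struct vm_area_struct *vma;

down_read(&mm->mmap_sem);                /* the VMA list is protected by mmap_sem */
for (vma = mm->mmap; vma; vma = vma->vm_next)
        printk(KERN_INFO "vma: %08lx-%08lx\n", vma->vm_start, vma->vm_end);
up_read(&mm->mmap_sem);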

Examining a Real Process's Memory Areas

The /proc/PID/maps file and the pmap utility show the memory areas in a given process's address space, along with their permissions and the program or library that backs each mapping.

# pmap -x 5371
5371: nginx: worker process
Address Kbytes RSS Dirty Mode Mapping
0000000000400000 564 344 0 r-x-- nginx //text section
000000000068c000 68 68 60 rw--- nginx //data section
000000000069d000 56 12 12 rw--- [ anon ]
000000000a0c8000 1812 1684 1684 rw--- [ anon ]
0000003ac0a00000 112 40 0 r-x-- ld-2.5.so //text section
0000003ac0c1c000 4 4 4 r---- ld-2.5.so //data section
0000003ac0c1d000 4 4 4 rw--- ld-2.5.so //bss section
0000003ac0e00000 1340 284 0 r-x-- libc-2.5.so
0000003ac0f4f000 2044 0 0 ----- libc-2.5.so
0000003ac114e000 16 16 8 r---- libc-2.5.so
0000003ac1152000 4 4 4 rw--- libc-2.5.so
0000003ac1153000 20 20 20 rw--- [ anon ]
00002b5751c3d000 4 4 4 rw-s- zero (deleted)
00002b5751c3e000 20012 20000 20000 rw--- [ anon ]
00007fffbf2ce000 84 20 20 rw--- [ stack ] //the process's stack
00007fffbf35e000 12 0 0 r-x-- [ anon ]
ffffffffff600000 8192 0 0 ----- [ anon ]
---------------- ------ ------ ------
total kB 72880 22940 22000

Note that the text sections carry r-x permissions, the data and bss sections carry rw- permissions, and the stack may carry rwx permissions.

The memory occupied by the C library above is shared and not writable, so very little physical memory is actually private to this process; sharing unwritable mappings in this way saves a large amount of memory.

Functions for Manipulating Memory Areas

find_vma()

find_vma() finds the memory area to which a given address belongs and returns its vm_area_struct. Strictly speaking, it returns the first area satisfying addr < vm_end, so the returned VMA does not necessarily contain addr; callers that need containment must also check vm_start.

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */

struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma = NULL;

        /* Check the cache first. */
        /* (Cache hit rate is typically around 35%.) */
        vma = ACCESS_ONCE(mm->mmap_cache);
        if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
                struct rb_node *rb_node; /* on a cache miss, search the red-black tree */

                rb_node = mm->mm_rb.rb_node;
                vma = NULL;

                while (rb_node) {
                        struct vm_area_struct *vma_tmp;

                        vma_tmp = rb_entry(rb_node,
                                           struct vm_area_struct, vm_rb);

                        if (vma_tmp->vm_end > addr) {
                                vma = vma_tmp;
                                if (vma_tmp->vm_start <= addr)
                                        break;
                                rb_node = rb_node->rb_left;
                        } else
                                rb_node = rb_node->rb_right;
                }
                if (vma)
                        mm->mmap_cache = vma;
        }
        return vma;
}
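
Typical in-kernel usage looks like the following sketch; because the function only guarantees addr < vm_end, callers check vm_start themselves when they need containment:

struct vm_area_struct *vma;

down_read(&mm->mmap_sem);                /* find_vma() requires mmap_sem */
vma = find_vma(mm, addr);
if (vma && vma->vm_start <= addr) {
        /* addr really lies inside vma */
}
up_read(&mm->mmap_sem);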

mmap() and do_mmap(): Creating an Address Interval

do_mmap() creates a new linear address interval, but it does not necessarily create a new VMA: if the new interval is adjacent to an existing memory area with identical permissions, the two are merged into one. Otherwise a new vm_area_struct is allocated from the vm_area_cachep slab cache and linked into the list and red-black tree with vma_link(), total_vm is updated, and the start address of the newly created interval is returned.

unsigned long do_mmap(struct file *file, unsigned long addr,
                      unsigned long len, unsigned long prot,
                      unsigned long flag, unsigned long offset)

//file   - the file to map
//offset - the mapping covers len bytes of the file, starting at this offset
//         (a NULL file and zero offset yield an anonymous mapping)
//prot   - memory access permissions
//flag   - VMA flags

User space reaches the functionality of do_mmap() through the mmap() system call.
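
For example, a user-space program can create a private anonymous mapping, the same kind of mapping a malloc() implementation uses for large allocations:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 4096;

        /* anonymous mapping: no backing file, fd is -1 */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* ... use the memory ... */

        munmap(p, len);  /* reaches do_munmap() in the kernel */
        return 0;
}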

do_munmap()

do_munmap() removes an address interval from the given address space, unmapping len bytes starting at address start. User space reaches it through the munmap() system call.

int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)

Page Tables

Although applications work with virtual memory, the processor operates directly on physical addresses, so virtual addresses must be translated to physical ones through page tables. Linux uses three-level page tables, which keeps the tables compact even for sparse address spaces.

The top level is the page global directory (PGD); its entries point to second-level page middle directories (PMD), whose entries point to the actual page tables (PTE), whose entries in turn point to physical pages.

On most architectures the page table walk is performed by hardware. Each process's memory descriptor has a pgd field pointing to its page global directory, and the structures representing the tables are architecture dependent.
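
As a sketch of the three-level walk just described, the helper below resolves a virtual address to its pte. The helper names follow the classic three-level kernels; exact signatures vary across kernel versions and architectures (four-level kernels insert a pud level between pgd and pmd):

/* Sketch: resolve addr to its page table entry, or NULL if unmapped. */
static pte_t *walk_page_table(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        pmd_t *pmd;

        pgd = pgd_offset(mm, addr);           /* level 1: page global directory */
        if (pgd_none(*pgd) || pgd_bad(*pgd))
                return NULL;

        pmd = pmd_offset(pgd, addr);          /* level 2: page middle directory */
        if (pmd_none(*pmd) || pmd_bad(*pmd))
                return NULL;

        return pte_offset_map(pmd, addr);     /* level 3: the page table entry */
}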

The TLB (translation lookaside buffer) is a hardware cache of virtual-to-physical address translations, consulted before the page tables are walked.