最近在啃APUE,看得有点云里雾里,结合具体实际例子整理对写时复制技术的理解,以免遗忘,如有错误欢迎反馈,文章不定时更新内容。

COW介绍

fork创建的子进程与父进程共享内存空间。fork()之后,kernel把父进程中所有的内存页的权限都设为read-only,然后子进程的地址空间指向父进程。当父子进程都只读内存时,相安无事。当其中某个进程写内存时,CPU硬件检测到内存页是read-only的,于是触发页异常中断(page-fault),陷入kernel的一个中断例程。中断例程中,kernel就会把触发异常的页复制一份,于是父子进程各自持有独立的一份。

虽然最开始接触COW是在fork时,但COW应当属于一种通用的技术,比如在redis、早期C++ STL、存储快照等技术中得以应用。

通俗地解释,假定多方需要使用同一个资源时,没有必要为每一方都创建该资源的一个完整的副本,而是令多方共享这个资源,当某方需要修改资源的某处时,利用引用计数,把该处复制一个副本,再把更新的内容写入该副本中,从而节省创建多个完整副本时带来的空间和时间上的开销。

 Resource 

                             +-------+
                             |   1   |
                             +-------+
User A   ---->               |   2   |               <---- User B
                             +-------+          
                             |   3   |
                             +-------+                             

当 A 需更新 Block 1 时,A 新建 New 1 Block,并把新的内容写入该 Block,从而避免影响 B 的 Block 1。同理,当 B 需更新 Block 3 时,B 新建 New 3 Block,并把新的内容写入该 Block,避免影响 A 的 Block 3。

COW好处

Copy On Write的好处:
COW技术可以减少fork时带来的资源开销(后面有例子验证)。子进程往往都会执行exec()来做自己想做的事情,此时完整复制父进程的数据段、堆和栈毫无意义。

STL string Copy On Write

C++ 曾在性能问题上被广泛地质疑指责过,为了提升性能,STL 的许多类采用了 Copy-On-Write 技术。Coolshell 的《C++ STL string 的 Copy-On-Write 技术》一文详细地从 C++ 层面介绍了该技术。
代码:

#include <stdio.h>
#include <string>
using namespace std;

// Prints the buffer address of two std::string objects, first right after
// str2 is copy-constructed from str1 and then again after one character of
// each string has been overwritten. Equal addresses in the first pair mean
// the two strings share one buffer; a changed address in the second pair
// means the written string received its own copy.
int main()
{
    std::string str1 = "hello world";
    std::string str2 = str1;

    // %p expects a void pointer, so the char pointers are converted explicitly.
    printf ("Sharing the memory:\n");
    printf ("\tstr1's address: %p\n", static_cast<const void*>(str1.c_str()) );
    printf ("\tstr2's address: %p\n", static_cast<const void*>(str2.c_str()) );

    // Write through the non-const operator[] of each string in turn.
    str1[1]='q';
    str2[1]='w';

    printf ("After Copy-On-Write:\n");
    printf ("\tstr1's address: %p\n", static_cast<const void*>(str1.c_str()) );
    printf ("\tstr2's address: %p\n", static_cast<const void*>(str2.c_str()) );

    return 0;
}

在ubuntu 18.04 gcc版本gcc version 7.4.0 (Ubuntu 7.4.0-1ubuntu1~18.04.1)运行结果如下:

Sharing the memory:
    str1's address: d777deb0
    str2's address: d777ded0
After Copy-On-Write:
    str1's address: d777deb0
    str2's address: d777ded0

看了原文评论,原来由于STL中使用COW所带来的副作用,在后面版本已经去掉了这个特性,其中最主要的一个原因是因为线程不安全。

我们可以在docker环境中安装gcc4.8进行验证:

[root workspace]#docker pull gcc:4.8
4.8: Pulling from library/gcc
47994b92ab73: Pull complete
a3ed95caeb02: Pull complete
9b7b75987c3c: Pull complete
d66c4af59bfb: Pull complete
26df7d6a7371: Pull complete
d4aa50056c0d: Pull complete
bfc6c0d1879f: Pull complete
f7667401f7bb: Pull complete
51ae28b91c03: Pull complete
Digest: sha256:43c6eaefe26ad414b2eefbee68688d1467823424cacbbd1703da4a50ce2c7654
Status: Downloaded newer image for gcc:4.8

[root workspace]#docker run -t -i gcc:4.8 /bin/bash

运行容器之后,在另一个会话中,要将cpp文件拷贝到容器中
docker cp /root/workspace/strcpp.cpp e93669e93174:/

root@e93669e93174:/# g++ -g -o strcpp strcpp.cpp
root@e93669e93174:/# ./strcpp
Sharing the memory:
    str1's address: 0x210d028
    str2's address: 0x210d028
After Copy-On-Write:
    str1's address: 0x210d058
    str2's address: 0x210d028
root@e93669e93174:/#
root@e93669e93174:/# g++ -v
Using built-in specs.
COLLECT_GCC=g++
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.8.5/lto-wrapper
Target: x86_64-unknown-linux-gnu
Configured with: /usr/src/gcc/configure --disable-multilib --enable-languages=c,c++
Thread model: posix
gcc version 4.8.5 (GCC)

我们可以看到在gcc 4.8版本中运行上述代码,在最开始修改str1中的内容后,str1的地址发生了变化。如果先修改的是str2的内容,那么str2的地址会变化。这里我们可以验证c++ stl string类中原本的COW特性。

fork COW验证

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

/*
 * Translates a virtual address of the calling process into a physical
 * address by looking up its entry in /proc/self/pagemap.
 *
 * addr: virtual address to translate.
 *
 * Returns the physical address, or 0 when the pagemap file cannot be opened,
 * positioned or read, or when the entry says the page is not present. A
 * message is printed to stdout in each of those cases.
 *
 * NOTE(review): the kernel pagemap documentation says the PFN field reads as
 * zero for callers without CAP_SYS_ADMIN on newer kernels; in that case only
 * the in-page offset would be returned. Confirm on the target kernel.
 */
size_t virtual_to_physical(size_t addr)
{
    /* Bit 63 of a pagemap entry: page present. */
    const uint64_t present_bit = UINT64_C(1) << 63;
    /* Bits 0-54 of a pagemap entry: page frame number (PFN) if present. */
    const uint64_t pfn_mask = (UINT64_C(1) << 55) - 1;

    int pagemap = open("/proc/self/pagemap", O_RDONLY);
    if (pagemap < 0) {
        printf("open '/proc/self/pagemap' failed!\n");
        return 0;
    }

    size_t page_bytes = getpagesize();
    /* One 64-bit entry per virtual page, indexed by virtual page number. */
    size_t entry_pos = (addr / page_bytes) * sizeof(uint64_t);
    uint64_t entry;
    size_t result = 0;

    if (lseek(pagemap, entry_pos, SEEK_SET) < 0) {
        printf("lseek() failed!\n");
    } else if (read(pagemap, &entry, sizeof(uint64_t)) != sizeof(uint64_t)) {
        printf("read() failed!\n");
    } else if ((entry & present_bit) == 0) {
        printf("page is not present!\n");
    } else {
        size_t pfn = entry & pfn_mask;
        /* Frame base plus the offset of addr inside its page. */
        result = pfn * page_bytes + addr % page_bytes;
    }

    close(pagemap);
    return result;
}

/*
 * Prints the virtual address, the physical address and the contents of a
 * heap buffer once before fork() and once more in the parent and in the
 * child, so the three lines can be compared.
 *
 * Returns 0 on success and 1 if the allocation or fork() fails.
 */
int main(void)
{
    char* str = malloc(128);
    if(str == NULL)
    {
        printf("malloc() failed!\n");
        return 1;
    }
    strcpy(str,"hello,world!");
    printf("original, vir = %p, phy = %p, val = '%s'\n",
        (void*)str, (void*)virtual_to_physical((size_t)str), str);
    /*
     * Flush before forking: when stdout is a pipe or a file it is fully
     * buffered, and the child would otherwise inherit the pending "original"
     * line and print it a second time.
     */
    fflush(stdout);
    pid_t pid = fork();
    if(pid < 0)
    {
        printf("fork() failed!\n");
        free(str);
        return 1;
    }
    else if(pid > 0)
    {
        printf("father,   vir = %p, phy = %p, val = '%s'\n",
            (void*)str, (void*)virtual_to_physical((size_t)str), str);
        /* Reap the child so it does not linger as a zombie. */
        wait(NULL);
    }
    else
    {
        printf("child,    vir = %p, phy = %p, val = '%s'\n",
            (void*)str, (void*)virtual_to_physical((size_t)str), str);
    }
    free(str);
    return 0;
}

运行结果:

[root workspace]#./fork
original, vir = 0x55a4b7662260, phy = 0x5d66d260, val = 'hello,world!'
father,   vir = 0x55a4b7662260, phy = 0x5d66d260, val = 'hello,world!'
child,    vir = 0x55a4b7662260, phy = 0x5d66d260, val = 'hello,world!'

验证如果父进程修改了内存里的值,代码如下:

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

/*
 * Looks up the /proc/self/pagemap entry that covers `addr` and converts it
 * into a physical address.
 *
 * addr: virtual address inside the calling process.
 *
 * Returns the physical address on success. Returns 0, after printing a
 * message to stdout, if the pagemap file cannot be opened, seeking or
 * reading fails, or the entry's present bit is clear.
 */
size_t virtual_to_physical(size_t addr)
{
    size_t phys = 0;
    uint64_t entry;

    int pm = open("/proc/self/pagemap", O_RDONLY);
    if (pm < 0) {
        printf("open '/proc/self/pagemap' failed!\n");
        return 0;
    }

    size_t page_len = getpagesize();
    /* Each virtual page owns one 64-bit slot in the pagemap file. */
    size_t slot = (addr / page_len) * sizeof(uint64_t);

    if (lseek(pm, slot, SEEK_SET) < 0) {
        printf("lseek() failed!\n");
        goto done;
    }

    if (read(pm, &entry, sizeof(uint64_t)) != sizeof(uint64_t)) {
        printf("read() failed!\n");
        goto done;
    }

    /* Bit 63: page present. */
    if (!(entry & (((uint64_t)1) << 63))) {
        printf("page is not present!\n");
        goto done;
    }

    /* Bits 0-54: page frame number (PFN) when the page is present. */
    size_t pfn = entry & ((((uint64_t)1) << 55) - 1);
    /* Scale the frame number to bytes and add the offset within the page. */
    phys = pfn * page_len + addr % page_len;

done:
    close(pm);
    return phys;
}

/*
 * Same experiment as the read-only variant, except that the parent
 * overwrites the first byte of the buffer after fork() and before printing,
 * so the parent's line can be compared with the child's and the original.
 *
 * Returns 0 on success and 1 if the allocation or fork() fails.
 */
int main(void)
{
    char* str = malloc(128);
    if(str == NULL)
    {
        printf("malloc() failed!\n");
        return 1;
    }
    strcpy(str,"hello,world!");
    printf("original, vir = %p, phy = %p, val = '%s'\n",
        (void*)str, (void*)virtual_to_physical((size_t)str), str);
    /*
     * Flush before forking: when stdout is a pipe or a file it is fully
     * buffered, and the child would otherwise inherit the pending "original"
     * line and print it a second time.
     */
    fflush(stdout);
    pid_t pid = fork();
    if(pid < 0)
    {
        printf("fork() failed!\n");
        free(str);
        return 1;
    }
    else if(pid > 0)
    {
        /* The write happens in the parent only; the child never modifies str. */
        str[0] = 'H';
        printf("father,   vir = %p, phy = %p, val = '%s'\n",
            (void*)str, (void*)virtual_to_physical((size_t)str), str);
        /* Reap the child so it does not linger as a zombie. */
        wait(NULL);
    }
    else
    {
        printf("child,    vir = %p, phy = %p, val = '%s'\n",
            (void*)str, (void*)virtual_to_physical((size_t)str), str);
    }
    free(str);
    return 0;
}

运行结果:

[root workspace]#gcc -g -o fork fork.c
[root workspace]#./fork
original, vir = 0x5566be6e5260, phy = 0x49f85260, val = 'hello,world!'
father,   vir = 0x5566be6e5260, phy = 0x47df8260, val = 'Hello,world!'
child,    vir = 0x5566be6e5260, phy = 0x49f85260, val = 'hello,world!'

父进程中的物理地址变了,而子进程仍保留原来的物理地址。同样可以验证子进程修改str的情况,结论:
谁去修改内存,谁得到新的内存。

那么到底拷贝多大的内存呢?

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/wait.h>

/*
 * Prints one line per page for a buffer, read from /proc/self/pagemap:
 * either "page frame number: N" or "page is not present!".
 *
 * addr: start of the buffer.
 * len:  length of the buffer in bytes; nothing is printed when it is 0.
 *
 * The number of entries reported is ceil(len / pagesize), starting with the
 * page that contains addr. When addr is not page aligned, the trailing
 * partial page of the buffer is therefore not reported.
 *
 * Entries are read into a fixed-size stack buffer in batches, so the stack
 * use does not grow with len and the heap is not touched. Failures to open,
 * seek or read the pagemap file are reported on stdout and end the dump.
 */
void dump_pfns(char* addr, size_t len)
{
    enum { CHUNK_ENTRIES = 64 };
    uint64_t info[CHUNK_ENTRIES];

    /* (len - 1) below would wrap around for an empty range. */
    if(len == 0)
        return;

    int fd = open("/proc/self/pagemap", O_RDONLY);
    if(fd < 0)
    {
        printf("open '/proc/self/pagemap' failed!\n");
        return;
    }
    size_t pagesize = getpagesize();
    /* One 64-bit entry per virtual page, indexed by virtual page number. */
    size_t offset = ((size_t)addr / pagesize) * sizeof(uint64_t);
    if(lseek(fd, offset, SEEK_SET) < 0)
    {
        printf("lseek() failed!\n");
        close(fd);
        return;
    }
    size_t pages = (len - 1) / pagesize + 1;
    while(pages > 0)
    {
        size_t batch = pages < (size_t)CHUNK_ENTRIES ? pages : (size_t)CHUNK_ENTRIES;
        size_t bytes = batch * sizeof(info[0]);
        /* Consecutive reads continue from where the previous batch ended. */
        if(read(fd, info, bytes) != (ssize_t)bytes)
        {
            printf("read() failed!\n");
            break;
        }
        for(size_t i = 0; i < batch; i++)
        {
            /* Bits 0-54: page frame number; bit 63: page present. */
            size_t frame = info[i] & ((((uint64_t)1) << 55) - 1);
            if((info[i] & (((uint64_t)1) << 63)) == 0)
                printf("page is not present!\n");
            else
                printf("page frame number: %zu\n", frame);
        }
        pages -= batch;
    }
    close(fd);
}

/*
 * Allocates and zero-fills a 64 KiB heap buffer, dumps its page frame
 * numbers, then forks. The parent overwrites the first byte and dumps the
 * frame numbers again; the child sleeps for a second and dumps them without
 * writing, so the three listings can be compared page by page.
 *
 * Returns 0 on success and 1 if the allocation or fork() fails.
 */
int main(void)
{
    size_t len = 65536;
    char* str = malloc(len);
    if(str == NULL)
    {
        printf("malloc() failed!\n");
        return 1;
    }
    /* Write to every byte of the buffer before its pages are inspected. */
    memset(str, 0, len);
    printf("original:\n");
    dump_pfns(str, len);

    /*
     * Flush before forking: when stdout is a pipe or a file it is fully
     * buffered, and the child would otherwise inherit the pending "original"
     * listing and print it a second time.
     */
    fflush(stdout);
    pid_t pid = fork();
    if(pid < 0)
    {
        printf("fork() failed!\n");
        free(str);
        return 1;
    }
    else if(pid > 0)
    {
        printf("father:\n");
        /* A single-byte write in the parent only. */
        str[0]='H';
        dump_pfns(str, len);
        wait(0);
    }
    else
    {
        /* Delay the child so that its listing follows the parent's. */
        sleep(1);
        printf("child:\n");
        dump_pfns(str, len);
    }
    free(str);
    return 0;
}

运行结果:

[root workspace]#./fork
original:
page frame number: 204864
page frame number: 272949
page frame number: 229979
page frame number: 312365
page frame number: 204897
page frame number: 204580
page frame number: 380167
page frame number: 312364
page frame number: 317945
page frame number: 225223
page frame number: 228378
page frame number: 395369
page frame number: 204271
page frame number: 445753
page frame number: 383656
page frame number: 290858
father:
page frame number: 395474
page frame number: 272949
page frame number: 229979
page frame number: 312365
page frame number: 204897
page frame number: 204580
page frame number: 380167
page frame number: 312364
page frame number: 317945
page frame number: 225223
page frame number: 228378
page frame number: 395369
page frame number: 204271
page frame number: 445753
page frame number: 383656
page frame number: 290858
child:
page frame number: 204864
page frame number: 272949
page frame number: 229979
page frame number: 312365
page frame number: 204897
page frame number: 204580
page frame number: 380167
page frame number: 312364
page frame number: 317945
page frame number: 225223
page frame number: 228378
page frame number: 395369
page frame number: 204271
page frame number: 445753
page frame number: 383656
page frame number: 290858

可以看到上面只有一个页的页帧号不同,由此可以得出结论:写时复制以页为单位,只复制被修改部分所在的页。

参考文章

Copy On Write机制了解一下
C++ STL string 的 Copy-On-Write 技术
pagemap
addr_trans
translate Virtual Address to Physical Address in User Space