最近在啃APUE,看得有点云里雾里,于是结合具体实例整理对写时复制(Copy-On-Write,COW)技术的理解,以免遗忘。如有错误欢迎反馈,文章不定时更新内容。
COW介绍
fork创建的子进程与父进程共享内存空间。fork()之后,kernel把父进程中所有内存页的权限都设为read-only,并让子进程的地址空间指向父进程的同一批物理页。当父子进程都只读内存时,相安无事。当其中某个进程写内存时,CPU硬件检测到内存页是read-only的,于是触发缺页异常(page fault),陷入kernel的异常处理例程。在该例程中,kernel会把触发异常的页复制一份,于是父子进程各自持有独立的一份。
虽然最开始接触COW是在fork时,但COW应当属于一种通用的技术,比如在redis、早期C++ STL、存储快照等技术中得以应用。
通俗的解释,假定多方需要使用同一个资源时,没有必要为每一方都创建该资源的一个完整的副本,反而令多方共享这个资源,当某方需要修改资源的某处时,利用引用计数,把该处复制一个副本,再把更新的内容写入该副本中,从而节省创建多个完整副本时带来的空间和时间上的开销。
Resource
+-------+
| 1 |
+-------+
User A ----> | 2 | <---- User B
+-------+
| 3 |
+-------+
当 A 需更新 Block 1 时,A 新建 New 1 Block,并把新的内容写入该 Block,从而避免影响 B 的 Block 1。同理,当 B 需更新 Block 3 时,B 新建 New 3 Block,并把新的内容写入该 Block,避免影响 A 的 Block 3。
COW好处
Copy On Write的好处:
COW技术可以减少fork时带来的资源开销(后面有例子验证)。子进程往往都会执行exec()来做自己想做的事情,复制父进程的代码段无意义。
STL string Copy On Write
C++ 曾在性能问题上被广泛地质疑和指责,为了提升性能,STL 的许多类采用了 Copy-On-Write 技术。Coolshell 的文章《C++ STL string 的 Copy-On-Write 技术》从 C++ 层面详细地介绍了该技术。
代码:
#include <stdio.h>
#include <string>
using namespace std;
// Demonstrates whether std::string shares one character buffer between a
// string and its copy, and whether a write through operator[] gives the
// written string a buffer of its own.
//
// The buffer addresses are printed right after the copy and again after both
// strings have been modified. Identical addresses in the first pair followed
// by different ones in the second indicate a copy-on-write implementation.
int main()
{
    std::string str1 = "hello world";
    std::string str2 = str1;

    // %p expects a void pointer, so the const char* is converted explicitly.
    printf("Sharing the memory:\n");
    printf("\tstr1's address: %p\n", static_cast<const void*>(str1.c_str()));
    printf("\tstr2's address: %p\n", static_cast<const void*>(str2.c_str()));

    // Writes through the non-const operator[]; with a copy-on-write string
    // this is the point where a shared buffer has to be duplicated.
    str1[1] = 'q';
    str2[1] = 'w';

    printf("After Copy-On-Write:\n");
    printf("\tstr1's address: %p\n", static_cast<const void*>(str1.c_str()));
    printf("\tstr2's address: %p\n", static_cast<const void*>(str2.c_str()));

    return 0;
}
在ubuntu 18.04 gcc版本gcc version 7.4.0 (Ubuntu 7.4.0-1ubuntu1~18.04.1)运行结果如下:
Sharing the memory:
str1's address: d777deb0
str2's address: d777ded0
After Copy-On-Write:
str1's address: d777deb0
str2's address: d777ded0
看了原文评论,原来由于STL中使用COW所带来的副作用,在后面版本已经去掉了这个特性,其中最主要的一个原因是因为线程不安全。
我们可以在docker环境中安装gcc4.8进行验证:
[root workspace]#docker pull gcc:4.8
4.8: Pulling from library/gcc
47994b92ab73: Pull complete
a3ed95caeb02: Pull complete
9b7b75987c3c: Pull complete
d66c4af59bfb: Pull complete
26df7d6a7371: Pull complete
d4aa50056c0d: Pull complete
bfc6c0d1879f: Pull complete
f7667401f7bb: Pull complete
51ae28b91c03: Pull complete
Digest: sha256:43c6eaefe26ad414b2eefbee68688d1467823424cacbbd1703da4a50ce2c7654
Status: Downloaded newer image for gcc:4.8
[root workspace]#docker run -t -i gcc:4.8 /bin/bash
运行容器之后,在另一个会话中,要将cpp文件拷贝到容器中
docker cp /root/workspace/strcpp.cpp e93669e93174:/
root@e93669e93174:/# g++ -g -o strcpp strcpp.cpp
root@e93669e93174:/# ./strcpp
Sharing the memory:
str1's address: 0x210d028
str2's address: 0x210d028
After Copy-On-Write:
str1's address: 0x210d058
str2's address: 0x210d028
root@e93669e93174:/#
root@e93669e93174:/# g++ -v
Using built-in specs.
COLLECT_GCC=g++
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.8.5/lto-wrapper
Target: x86_64-unknown-linux-gnu
Configured with: /usr/src/gcc/configure --disable-multilib --enable-languages=c,c++
Thread model: posix
gcc version 4.8.5 (GCC)
我们可以看到,在gcc 4.8版本中运行上述代码,最先修改str1的内容后,str1的地址发生了变化;如果先修改的是str2的内容,那么变化的就是str2的地址。由此可以验证C++ STL string类中原本的COW特性。
fork COW验证
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
/*
 * Translate a virtual address of the calling process into a physical address
 * by reading its entry from /proc/self/pagemap.
 *
 * pagemap holds one 64-bit entry per virtual page, indexed by virtual page
 * number:
 *   bit  63    page is present in RAM
 *   bits 0-54  page frame number (PFN), meaningful only when present
 *
 * addr: virtual address to translate.
 *
 * Returns the physical address, or 0 when no translation is available
 * (pagemap cannot be opened or read, the page is not present, or the kernel
 * reported a zero PFN). A diagnostic is written to stderr in every failure
 * case.
 */
size_t virtual_to_physical(size_t addr)
{
    const uint64_t present_bit = (uint64_t)1 << 63;
    const uint64_t pfn_mask = ((uint64_t)1 << 55) - 1;
    uint64_t info = 0;
    uint64_t frame = 0;
    size_t pagesize = 0;
    size_t phy = 0;
    off_t offset = 0;
    long ps = 0;

    int fd = open("/proc/self/pagemap", O_RDONLY);
    if (fd < 0) {
        fprintf(stderr, "open '/proc/self/pagemap' failed!\n");
        return 0;
    }

    /* sysconf() is the POSIX way to query the page size. */
    ps = sysconf(_SC_PAGESIZE);
    if (ps <= 0) {
        fprintf(stderr, "sysconf(_SC_PAGESIZE) failed!\n");
        goto out;
    }
    pagesize = (size_t)ps;

    /* Entry index is the virtual page number; each entry is 8 bytes. */
    offset = (off_t)((addr / pagesize) * sizeof(uint64_t));
    if (lseek(fd, offset, SEEK_SET) == (off_t)-1) {
        fprintf(stderr, "lseek() failed!\n");
        goto out;
    }

    if (read(fd, &info, sizeof info) != (ssize_t)sizeof info) {
        fprintf(stderr, "read() failed!\n");
        goto out;
    }

    /* Bit 63: page present. */
    if ((info & present_bit) == 0) {
        fprintf(stderr, "page is not present!\n");
        goto out;
    }

    /* Bits 0-54: page frame number. */
    frame = info & pfn_mask;
    if (frame == 0) {
        /*
         * Since Linux 4.2 the kernel zeroes the PFN field for readers
         * without CAP_SYS_ADMIN. Reporting that as a failure avoids
         * returning a bare page offset as if it were a physical address.
         */
        fprintf(stderr, "PFN is not available (CAP_SYS_ADMIN required)!\n");
        goto out;
    }

    /* Physical address = frame base + offset of addr inside its page. */
    phy = (size_t)(frame * pagesize + addr % pagesize);

out:
    close(fd);
    return phy;
}
/*
 * Prints the virtual address, the physical address and the contents of a
 * heap string once before fork() and once more in the parent and in the
 * child, so the three lines can be compared. Neither process writes to the
 * string after the fork.
 *
 * Returns 0 on success, 1 if the allocation or fork() fails.
 */
int main(void)
{
    char *str = malloc(128);
    if (str == NULL) {
        fprintf(stderr, "malloc() failed!\n");
        return 1;
    }
    strcpy(str, "hello,world!");

    printf("original, vir = %p, phy = %p, val = '%s'\n",
           str, (void*)virtual_to_physical((size_t)str), str);

    /*
     * Flush before forking: when stdout is not a terminal the line above is
     * still sitting in the stdio buffer, the child inherits a copy of that
     * buffer, and the line would be written twice.
     */
    fflush(stdout);

    pid_t pid = fork();
    if (pid < 0) {
        fprintf(stderr, "fork() failed!\n");
        free(str);
        return 1;
    }

    if (pid > 0) {
        printf("father, vir = %p, phy = %p, val = '%s'\n",
               str, (void*)virtual_to_physical((size_t)str), str);
        /* Reap the child so it does not linger as a zombie. */
        wait(NULL);
    } else {
        printf("child, vir = %p, phy = %p, val = '%s'\n",
               str, (void*)virtual_to_physical((size_t)str), str);
    }

    free(str);
    return 0;
}
运行结果:
[root workspace]#./fork
original, vir = 0x55a4b7662260, phy = 0x5d66d260, val = 'hello,world!'
father, vir = 0x55a4b7662260, phy = 0x5d66d260, val = 'hello,world!'
child, vir = 0x55a4b7662260, phy = 0x5d66d260, val = 'hello,world!'
验证如果父进程修改了内存里的值,代码如下:
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
/*
 * Translate a virtual address of the calling process into a physical address
 * by reading its entry from /proc/self/pagemap.
 *
 * pagemap holds one 64-bit entry per virtual page, indexed by virtual page
 * number:
 *   bit  63    page is present in RAM
 *   bits 0-54  page frame number (PFN), meaningful only when present
 *
 * addr: virtual address to translate.
 *
 * Returns the physical address, or 0 when no translation is available
 * (pagemap cannot be opened or read, the page is not present, or the kernel
 * reported a zero PFN). A diagnostic is written to stderr in every failure
 * case.
 */
size_t virtual_to_physical(size_t addr)
{
    const uint64_t present_bit = (uint64_t)1 << 63;
    const uint64_t pfn_mask = ((uint64_t)1 << 55) - 1;
    uint64_t info = 0;
    uint64_t frame = 0;
    size_t pagesize = 0;
    size_t phy = 0;
    off_t offset = 0;
    long ps = 0;

    int fd = open("/proc/self/pagemap", O_RDONLY);
    if (fd < 0) {
        fprintf(stderr, "open '/proc/self/pagemap' failed!\n");
        return 0;
    }

    /* sysconf() is the POSIX way to query the page size. */
    ps = sysconf(_SC_PAGESIZE);
    if (ps <= 0) {
        fprintf(stderr, "sysconf(_SC_PAGESIZE) failed!\n");
        goto out;
    }
    pagesize = (size_t)ps;

    /* Entry index is the virtual page number; each entry is 8 bytes. */
    offset = (off_t)((addr / pagesize) * sizeof(uint64_t));
    if (lseek(fd, offset, SEEK_SET) == (off_t)-1) {
        fprintf(stderr, "lseek() failed!\n");
        goto out;
    }

    if (read(fd, &info, sizeof info) != (ssize_t)sizeof info) {
        fprintf(stderr, "read() failed!\n");
        goto out;
    }

    /* Bit 63: page present. */
    if ((info & present_bit) == 0) {
        fprintf(stderr, "page is not present!\n");
        goto out;
    }

    /* Bits 0-54: page frame number. */
    frame = info & pfn_mask;
    if (frame == 0) {
        /*
         * Since Linux 4.2 the kernel zeroes the PFN field for readers
         * without CAP_SYS_ADMIN. Reporting that as a failure avoids
         * returning a bare page offset as if it were a physical address.
         */
        fprintf(stderr, "PFN is not available (CAP_SYS_ADMIN required)!\n");
        goto out;
    }

    /* Physical address = frame base + offset of addr inside its page. */
    phy = (size_t)(frame * pagesize + addr % pagesize);

out:
    close(fd);
    return phy;
}
/*
 * Same experiment as the read-only variant, except that the parent
 * overwrites the first character of the string after fork() and before
 * printing. Comparing the parent's and the child's lines shows which process
 * ends up with a different physical address after the write.
 *
 * Returns 0 on success, 1 if the allocation or fork() fails.
 */
int main(void)
{
    char *str = malloc(128);
    if (str == NULL) {
        fprintf(stderr, "malloc() failed!\n");
        return 1;
    }
    strcpy(str, "hello,world!");

    printf("original, vir = %p, phy = %p, val = '%s'\n",
           str, (void*)virtual_to_physical((size_t)str), str);

    /*
     * Flush before forking: when stdout is not a terminal the line above is
     * still sitting in the stdio buffer, the child inherits a copy of that
     * buffer, and the line would be written twice.
     */
    fflush(stdout);

    pid_t pid = fork();
    if (pid < 0) {
        fprintf(stderr, "fork() failed!\n");
        free(str);
        return 1;
    }

    if (pid > 0) {
        /* The only write after the fork; it happens in the parent. */
        str[0] = 'H';
        printf("father, vir = %p, phy = %p, val = '%s'\n",
               str, (void*)virtual_to_physical((size_t)str), str);
        /* Reap the child so it does not linger as a zombie. */
        wait(NULL);
    } else {
        printf("child, vir = %p, phy = %p, val = '%s'\n",
               str, (void*)virtual_to_physical((size_t)str), str);
    }

    free(str);
    return 0;
}
运行结果:
[root workspace]#gcc -g -o fork fork.c
[root workspace]#./fork
original, vir = 0x5566be6e5260, phy = 0x49f85260, val = 'hello,world!'
father, vir = 0x5566be6e5260, phy = 0x47df8260, val = 'Hello,world!'
child, vir = 0x5566be6e5260, phy = 0x49f85260, val = 'hello,world!'
父进程中的物理地址变了,而子进程仍然保留原来的物理地址。同样可以验证由子进程修改str的情况,结论:
谁去修改内存,谁得到新的内存。
那么到底拷贝多大的内存呢?
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/wait.h>
/*
 * Print the page frame number of each page backing a buffer, one line per
 * page, using the entries in /proc/self/pagemap.
 *
 * pagemap holds one 64-bit entry per virtual page: bit 63 is set when the
 * page is present in RAM and bits 0-54 carry the page frame number.
 *
 * addr: start of the buffer.
 * len:  length of the buffer in bytes; nothing is printed when it is 0.
 *
 * The listing starts at the page containing addr and covers
 * ceil(len / pagesize) pages. NOTE: for a buffer that does not start on a
 * page boundary the trailing partial page is therefore not listed.
 *
 * Failures are reported on stderr and end the dump without page output.
 */
void dump_pfns(char* addr, size_t len)
{
    const uint64_t present_bit = (uint64_t)1 << 63;
    const uint64_t pfn_mask = ((uint64_t)1 << 55) - 1;

    /* (len - 1) below would wrap around for an empty buffer. */
    if (len == 0) {
        return;
    }

    long ps = sysconf(_SC_PAGESIZE);
    if (ps <= 0) {
        fprintf(stderr, "sysconf(_SC_PAGESIZE) failed!\n");
        return;
    }
    size_t pagesize = (size_t)ps;
    size_t pages = (len - 1) / pagesize + 1;

    /* Heap buffer instead of a VLA: the size is chosen by the caller. */
    uint64_t *info = calloc(pages, sizeof *info);
    if (info == NULL) {
        fprintf(stderr, "calloc() failed!\n");
        return;
    }
    size_t bytes = pages * sizeof *info;

    int fd = open("/proc/self/pagemap", O_RDONLY);
    if (fd < 0) {
        fprintf(stderr, "open '/proc/self/pagemap' failed!\n");
        free(info);
        return;
    }

    /* Entry index is the virtual page number; each entry is 8 bytes. */
    off_t offset = (off_t)(((size_t)addr / pagesize) * sizeof(uint64_t));
    if (lseek(fd, offset, SEEK_SET) == (off_t)-1) {
        fprintf(stderr, "lseek() failed!\n");
        close(fd);
        free(info);
        return;
    }

    ssize_t got = read(fd, info, bytes);
    close(fd);
    if (got < 0 || (size_t)got != bytes) {
        fprintf(stderr, "read() failed!\n");
        free(info);
        return;
    }

    for (size_t i = 0; i < pages; i++) {
        if ((info[i] & present_bit) == 0) {
            printf("page is not present!\n");
        } else {
            /* Cast so the argument matches the %lu conversion. */
            printf("page frame number: %lu\n",
                   (unsigned long)(info[i] & pfn_mask));
        }
    }

    free(info);
}
/*
 * Allocates a 64 KiB buffer, touches every byte of it, and lists its page
 * frame numbers three times: before fork(), in the parent after the parent
 * has written one byte at the start of the buffer, and in the child. The
 * child sleeps for a second first so that its listing follows the parent's.
 *
 * Returns 0 on success, 1 if the allocation or fork() fails.
 */
int main(void)
{
    size_t len = 65536;
    char *str = malloc(len);
    if (str == NULL) {
        fprintf(stderr, "malloc() failed!\n");
        return 1;
    }
    /* Writing to the whole buffer makes sure every page is backed by RAM. */
    memset(str, 0, len);

    printf("original:\n");
    dump_pfns(str, len);

    /*
     * Flush before forking: when stdout is not a terminal the listing above
     * is still sitting in the stdio buffer, the child inherits a copy of
     * that buffer, and the listing would be written twice.
     */
    fflush(stdout);

    pid_t pid = fork();
    if (pid < 0) {
        fprintf(stderr, "fork() failed!\n");
        free(str);
        return 1;
    }

    if (pid > 0) {
        printf("father:\n");
        /* The only write after the fork: a single byte in the first page. */
        str[0] = 'H';
        dump_pfns(str, len);
        wait(NULL);
    } else {
        sleep(1);
        printf("child:\n");
        dump_pfns(str, len);
    }

    free(str);
    return 0;
}
运行结果:
[root workspace]#./fork
original:
page frame number: 204864
page frame number: 272949
page frame number: 229979
page frame number: 312365
page frame number: 204897
page frame number: 204580
page frame number: 380167
page frame number: 312364
page frame number: 317945
page frame number: 225223
page frame number: 228378
page frame number: 395369
page frame number: 204271
page frame number: 445753
page frame number: 383656
page frame number: 290858
father:
page frame number: 395474
page frame number: 272949
page frame number: 229979
page frame number: 312365
page frame number: 204897
page frame number: 204580
page frame number: 380167
page frame number: 312364
page frame number: 317945
page frame number: 225223
page frame number: 228378
page frame number: 395369
page frame number: 204271
page frame number: 445753
page frame number: 383656
page frame number: 290858
child:
page frame number: 204864
page frame number: 272949
page frame number: 229979
page frame number: 312365
page frame number: 204897
page frame number: 204580
page frame number: 380167
page frame number: 312364
page frame number: 317945
page frame number: 225223
page frame number: 228378
page frame number: 395369
page frame number: 204271
page frame number: 445753
page frame number: 383656
page frame number: 290858
可以看到上面只有一个页的页帧号不同,由此可以得出结论:只会复制被修改的部分所在的页,而不是整块内存。