从源码看 postgresql 共享内存使用
共享内存实现方式
源码中可见 Postgres 共享内存的实现方式有 3 种,默认使用的是 MMAP 内存映射方式
/* Selects which shared memory implementation is used. */
typedef enum
{
SHMEM_TYPE_WINDOWS, /* NOTE(review): presumably the Windows-native implementation -- confirm */
SHMEM_TYPE_SYSV, /* System V shared memory (shmget/shmat) */
SHMEM_TYPE_MMAP, /* anonymous shared mapping created with mmap() */
} PGShmemType;
#define DEFAULT_SHARED_MEMORY_TYPE SHMEM_TYPE_MMAP
共享内存组成部分和大小
源码文件 backend/storage/ipc/ipci.c
源码函数 CalculateShmemSize(int *num_semaphores)
整体组成部分有下面这么多,其中最大的部分是 BufferShmemSize,也就是数据库参数 shared_buffers 对应的部分。
size = add_size(size, PGSemaphoreShmemSize(numSemas));
size = add_size(size, SpinlockSemaSize());
size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
sizeof(ShmemIndexEnt)));
size = add_size(size, dsm_estimate_size());
size = add_size(size, DSMRegistryShmemSize());
size = add_size(size, BufferShmemSize());
size = add_size(size, LockShmemSize());
size = add_size(size, PredicateLockShmemSize());
size = add_size(size, ProcGlobalShmemSize());
size = add_size(size, XLogPrefetchShmemSize());
size = add_size(size, VarsupShmemSize());
size = add_size(size, XLOGShmemSize());
size = add_size(size, XLogRecoveryShmemSize());
size = add_size(size, CLOGShmemSize());
size = add_size(size, CommitTsShmemSize());
size = add_size(size, SUBTRANSShmemSize());
size = add_size(size, TwoPhaseShmemSize());
size = add_size(size, BackgroundWorkerShmemSize());
size = add_size(size, MultiXactShmemSize());
size = add_size(size, LWLockShmemSize());
size = add_size(size, ProcArrayShmemSize());
size = add_size(size, BackendStatusShmemSize());
size = add_size(size, SInvalShmemSize());
size = add_size(size, PMSignalShmemSize());
size = add_size(size, ProcSignalShmemSize());
size = add_size(size, CheckpointerShmemSize());
size = add_size(size, AutoVacuumShmemSize());
size = add_size(size, ReplicationSlotsShmemSize());
size = add_size(size, ReplicationOriginShmemSize());
size = add_size(size, WalSndShmemSize());
size = add_size(size, WalRcvShmemSize());
size = add_size(size, WalSummarizerShmemSize());
size = add_size(size, PgArchShmemSize());
size = add_size(size, ApplyLauncherShmemSize());
size = add_size(size, BTreeShmemSize());
size = add_size(size, SyncScanShmemSize());
size = add_size(size, AsyncShmemSize());
size = add_size(size, StatsShmemSize());
size = add_size(size, WaitEventExtensionShmemSize());
size = add_size(size, InjectionPointShmemSize());
size = add_size(size, SlotSyncShmemSize());
共享内存创建
在刚创建初期,共享内存结构很简单,如下图。
共享内存简单结构图
创建内存主体
函数:PGSharedMemoryCreate(Size size, PGShmemHeader **shim)
使用 mmap 方式创建共享内存
1、如果操作系统配置了 hugepage 则尝试使用 hugepage 分配内存
ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
PG_MMAP_FLAGS | mmap_flags, -1, 0);
2、如果操作系统没有配置hugepage ,则用普通内存分配
ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
PG_MMAP_FLAGS, -1, 0);
note:
mmap_flags 是前面函数获取的值,通常是 MAP_HUGETLB
#define PG_MMAP_FLAGS (MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE) 使用匿名共享的方式 mmap
调用堆栈
#0 CreateAnonymousSegment (size=0x7fffffffd7b8) at pg_shmem.c:617
#1 0x000000000090b831 in PGSharedMemoryCreate (size=149848064, shim=0x7fffffffd8a8) at pg_shmem.c:739
#2 0x00000000009b3c6d in CreateSharedMemoryAndSemaphores () at ipci.c:215
#3 0x0000000000915b7f in PostmasterMain (argc=1, argv=0x1205780) at postmaster.c:977
一点想法
调试代码来看,系统运行到这里,共享内存已经分配完成了,但从操作系统 /proc/meminfo 中看不到实际使用值增加。
其实刚启动的 postgres 不管 shared_buffers 设置得多大,从操作系统都观察不到相应的内存占用。这跟 linux 内核的内存管理有关:分配的内存要等到实际使用时才会真正被占用,这样是蛮合理的。
为了验证这点,用一个简单的例子验证一下。
-
在 /tmp 目录下生成一个约 100 MB 的文件,待会将这个文件读入内存
dd if=/dev/zero of=test.file bs=1024000 count=100
-
使用 mmap 模拟 postgres 分配内存的方式,分配 100 MB 内存,然后单步调试执行代码,观察 read 读入真实数据前后 /proc/meminfo 里 Shmem 的变化。可以看到,只有当共享内存被真正使用的时候,操作系统才会分配相应的内存。
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <string.h>
#include <errno.h>
/*
 * Demo: map 104857600 bytes of anonymous shared memory the same way
 * PostgreSQL does (MAP_SHARED | MAP_ANONYMOUS), then read /tmp/test.file
 * into it.  Step through it in a debugger and compare the Shmem line of
 * /proc/meminfo before and after the read() call.
 *
 * Returns 0 on success, -1 if the file cannot be opened, the mapping
 * cannot be created, or the read fails.
 */
int
main (int argc, char **argv)
{
	const size_t map_len = 104857600;
	int fd;
	int rc = -1;
	ssize_t res;
	char *ptr;

	(void) argc;
	(void) argv;

	fd = open("/tmp/test.file", O_RDONLY);
	if (fd == -1) {
		fprintf(stderr, "open file error: %s\n", strerror(errno));
		return -1;
	}

	ptr = mmap(NULL, map_len, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	/* mmap reports failure with MAP_FAILED, not NULL. */
	if (ptr == MAP_FAILED) {
		fprintf(stderr, "mmap error: %s\n", strerror(errno));
		close(fd);
		return -1;
	}

	/*
	 * A single read() is enough for the demo: writing the file contents
	 * into the mapping is what touches its pages.  The call may return
	 * fewer bytes than requested (e.g. when the file is smaller than the
	 * mapping), so report the actual count.
	 */
	res = read(fd, ptr, map_len);
	if (res == -1) {
		fprintf(stderr, "read error: %s\n", strerror(errno));
	} else {
		/* read() returns ssize_t, so print it with %zd rather than %d. */
		printf("total read %zd bytes\n", res);
		rc = 0;
	}

	munmap(ptr, map_len);
	close(fd);
	return rc;
}
read 前
[postgres@ tmp]$ cat /proc/meminfo |grep Shm
Shmem: 138064 kB
read 后
[postgres@ tmp]$ cat /proc/meminfo |grep Shm
Shmem: 238064 kB
创建共享内存头部结构
PGShmemHeader结构体定义
typedef struct PGShmemHeader /* standard header for all Postgres shmem */
{
int32 magic; /* magic # to identify Postgres segments */
/* PGShmemMagic is the constant written into the magic field above */
#define PGShmemMagic 679834894
pid_t creatorPID; /* PID of creating process (set but unread) */
Size totalsize; /* total size of segment */
Size freeoffset; /* offset to first free space */
dsm_handle dsm_control; /* ID of dynamic shared memory control seg */
void *index; /* pointer to ShmemIndex table */
/* The data directory identity fields below exist only on non-Windows builds */
#ifndef WIN32 /* Windows doesn't have useful inode#s */
dev_t device; /* device data directory is on */
ino_t inode; /* inode number of data directory */
#endif
} PGShmemHeader;
- 使用system v 方式创建内存头部信息,其中用$PGDATA 路径的 inode 作为 key
shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);
//返回值memAddress 是创建共享内存的起始地址
操作系统可以看到这个共享内存
[postgres@edc-pdf-dmdb02 ~]$ ipcs -m|grep 56
0x0827c9e0 1736767 postgres 600 56 1
- 填充PGShmemHeader变量内容,包括 total size 等信息
hdr = (PGShmemHeader *) memAddress;
hdr->creatorPID = getpid();
hdr->magic = PGShmemMagic;
hdr->dsm_control = 0;
/* Fill in the data directory ID info, too */
hdr->device = statbuf.st_dev;
hdr->inode = statbuf.st_ino;
/*
* Initialize space allocation status for segment.
*/
hdr->totalsize = size;
- 复制 PGShmemHeader 到 mmap 申请到的内存主体里
memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
共享内存分配
创建信号量
-
postgres 每个后台进程和辅助进程都需要信号量
-
如果没有指明使用 posix 信号量,则使用 system v 信号量
函数名:PGReserveSemaphores(numSemas);
sharedSemas = (PGSemaphore) ShmemAllocUnlocked(PGSemaphoreShmemSize(maxSemas));
-> PGSemaphoreShmemSize(maxSemas) 每个信号量 size 为 128,默认参数需要 128 个信号量,所以此处 size 为 16384 byte
创建共享内存分配机制
- spinlock 作为最基本的锁机制,需要先初始化
ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t));
从共享内存中分配空间给ShmemLock,真实分配的 size 为内存对齐后的 size ,size = MAXALIGN(size);最小单位是 8 byte
SpinLockInit(ShmemLock);
初始化 spinlock,赋值为 0
- 接下来需要一次内存对齐,代码里注释是下面这样的,从注释里看到,后面的内存分配需要调用ShmemAlloc,并且需要对 cache line 内存对齐,cache line size 是 128
/*
* Allocations after this point should go through ShmemAlloc, which
* expects to allocate everything on cache line boundaries. Make sure the
* first allocation begins on a cache line boundary.
*/
aligned = (char *)
(CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset)));
shmhdr->freeoffset = aligned - (char *) shmhdr;
#define PG_CACHE_LINE_SIZE 128
初始化子系统内存
这步骤会创建很多模块的内存数据结构,且分配空间给他们,这里主要看一下占大部分空间的 buffer pool
CreateOrAttachShmemStructs();
->
粗略的看一下有几大块子系统
1 --> xlog, clog, and buffers
VarsupShmemInit();
XLOGShmemInit();
XLogPrefetchShmemInit();
XLogRecoveryShmemInit();
CLOGShmemInit();
CommitTsShmemInit();
SUBTRANSShmemInit();
MultiXactShmemInit();
InitBufferPool();
2 --> Set up process table
CreateSharedProcArray();
CreateSharedBackendStatus();
TwoPhaseShmemInit();
BackgroundWorkerShmemInit();
3 --> Set up interprocess signaling mechanisms
PMSignalShmemInit();
ProcSignalShmemInit();
CheckpointerShmemInit();
AutoVacuumShmemInit();
ReplicationSlotsShmemInit();
ReplicationOriginShmemInit();
WalSndShmemInit();
WalRcvShmemInit();
WalSummarizerShmemInit();
PgArchShmemInit();
ApplyLauncherShmemInit();
SlotSyncShmemInit();
等等
创建初始化BufferPool();
函数名 InitBufferPool();
分配 buffer pool 描述符
每一个 buffer pool page 都有一个单独的描述符,记录 buffer page 的 id 和 flag 等信息
BufferDescriptors = (BufferDescPadded *)
ShmemInitStruct("Buffer Descriptors",
NBuffers * sizeof(BufferDescPadded),
&foundDescs);
设定 buffer pool blocks 的内存位置
且按 PG_IO_ALIGN_SIZE 进行内存对齐,大小为 4k,兼容大多数的磁盘扇区大小和物理页大小
BufferBlocks = (char *)
TYPEALIGN(PG_IO_ALIGN_SIZE,
ShmemInitStruct("Buffer Blocks",
NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
&foundBufs));
/*
* Assumed alignment requirement for direct I/O. 4K corresponds to common
* sector and memory page size.
*/
#define PG_IO_ALIGN_SIZE 4096
设定buffer pool 条件变量 起始地址
BufferIOCVArray = (ConditionVariableMinimallyPadded *)
ShmemInitStruct("Buffer IO Condition Variables",
NBuffers * sizeof(ConditionVariableMinimallyPadded),
&foundIOCV);
设定CkptBufferIds变量地址
CkptBufferIds = (CkptSortItem *)
ShmemInitStruct("Checkpoint BufferIds",
NBuffers * sizeof(CkptSortItem), &foundBufCkpt);
初始化完 shared buffer 后的共享内存结构图