数据库的路上

从源码看 postgresql 共享内存使用

共享内存实现方式

源码中可见 Postgres 共享内存的实现方式有 3 种,默认使用的是 MMAP 内存映射方式

/*
 * Kinds of shared memory implementation. Three enumerators are defined;
 * judging by their names they presumably stand for Windows-native,
 * System V and mmap-based shared memory -- confirm against the code that
 * consumes this type.
 */
typedef enum
{
	SHMEM_TYPE_WINDOWS,
	SHMEM_TYPE_SYSV,
	SHMEM_TYPE_MMAP,
}	PGShmemType;

#define DEFAULT_SHARED_MEMORY_TYPE SHMEM_TYPE_MMAP

共享内存组成部分和大小

源码文件 backend/storage/ipc/ipci.c
源码函数 CalculateShmemSize(int *num_semaphores)

整体组成部分有下面这么多,其中最大的部分是 BufferShmemSize,也就是数据库参数 shared_buffers 对应的部分。

  size = add_size(size, PGSemaphoreShmemSize(numSemas));
	size = add_size(size, SpinlockSemaSize());
	size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
											 sizeof(ShmemIndexEnt)));
	size = add_size(size, dsm_estimate_size());
	size = add_size(size, DSMRegistryShmemSize());
	size = add_size(size, BufferShmemSize());
	size = add_size(size, LockShmemSize());
	size = add_size(size, PredicateLockShmemSize());
	size = add_size(size, ProcGlobalShmemSize());
	size = add_size(size, XLogPrefetchShmemSize());
	size = add_size(size, VarsupShmemSize());
	size = add_size(size, XLOGShmemSize());
	size = add_size(size, XLogRecoveryShmemSize());
	size = add_size(size, CLOGShmemSize());
	size = add_size(size, CommitTsShmemSize());
	size = add_size(size, SUBTRANSShmemSize());
	size = add_size(size, TwoPhaseShmemSize());
	size = add_size(size, BackgroundWorkerShmemSize());
	size = add_size(size, MultiXactShmemSize());
	size = add_size(size, LWLockShmemSize());
	size = add_size(size, ProcArrayShmemSize());
	size = add_size(size, BackendStatusShmemSize());
	size = add_size(size, SInvalShmemSize());
	size = add_size(size, PMSignalShmemSize());
	size = add_size(size, ProcSignalShmemSize());
	size = add_size(size, CheckpointerShmemSize());
	size = add_size(size, AutoVacuumShmemSize());
	size = add_size(size, ReplicationSlotsShmemSize());
	size = add_size(size, ReplicationOriginShmemSize());
	size = add_size(size, WalSndShmemSize());
	size = add_size(size, WalRcvShmemSize());
	size = add_size(size, WalSummarizerShmemSize());
	size = add_size(size, PgArchShmemSize());
	size = add_size(size, ApplyLauncherShmemSize());
	size = add_size(size, BTreeShmemSize());
	size = add_size(size, SyncScanShmemSize());
	size = add_size(size, AsyncShmemSize());
	size = add_size(size, StatsShmemSize());
	size = add_size(size, WaitEventExtensionShmemSize());
	size = add_size(size, InjectionPointShmemSize());
	size = add_size(size, SlotSyncShmemSize());

共享内存创建

在刚创建初期,共享内存结构很简单,如下图。

​ 共享内存简单结构图

创建内存主体

函数:PGSharedMemoryCreate(Size size, PGShmemHeader **shim)

使用 mmap 方式创建共享内存

1、如果操作系统配置了 hugepage 则尝试使用 hugepage 分配内存
ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
				   PG_MMAP_FLAGS | mmap_flags, -1, 0);  



2、如果操作系统没有配置hugepage ,则用普通内存分配
ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
				   PG_MMAP_FLAGS, -1, 0);
	
  
note: 
mmap_flags 是前面函数获取的值,通常是 MAP_HUGETLB
#define PG_MMAP_FLAGS			(MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE)   使用匿名共享的方式 mmap

调用堆栈

#0  CreateAnonymousSegment (size=0x7fffffffd7b8) at pg_shmem.c:617
#1  0x000000000090b831 in PGSharedMemoryCreate (size=149848064, shim=0x7fffffffd8a8) at pg_shmem.c:739
#2  0x00000000009b3c6d in CreateSharedMemoryAndSemaphores () at ipci.c:215
#3  0x0000000000915b7f in PostmasterMain (argc=1, argv=0x1205780) at postmaster.c:977

一点想法

调试代码来看,系统运行到这里,共享内存已经分配完成了,但从操作系统 /proc/meminfo 中看不到实际使用值增加。

其实刚启动的 postgres 不管 shared_buffers 设置得多大,从操作系统都观察不到内存占用。这跟 linux 内核的内存管理有关:分配的内存要等到实际使用时才会真正占用物理内存,这样是蛮合理的。

为了验证这点,用一个简单的例子验证一下。

  1. 在 /tmp 目录下生成一个约 100 MB 的文件(下面的程序读取的是 /tmp/test.file),待会将这个文件读入内存

    dd if=/dev/zero of=test.file bs=1024000 count=100
    
  2. 使用 mmap 模拟 postgres 分配内存的方式,分配 100 MB 内存,然后单步调试执行代码,对比 read 动作读入真实数据前后 /proc/meminfo 里 Shmem 值的变化,可以看到只有当共享内存被真实使用的时候,操作系统才会分配相应的内存。

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <string.h>
#include <errno.h>
/*
 * Demo: reserve 100 MiB of anonymous shared memory the same way the article
 * describes postgres doing it (MAP_SHARED | MAP_ANONYMOUS), then fill it with
 * the contents of /tmp/test.file.
 *
 * Intended to be single-stepped in a debugger: compare the Shmem line of
 * /proc/meminfo before the read loop and after it (but before munmap) to see
 * that the mapping is only accounted once its pages are actually written.
 *
 * Returns 0 on success, -1 if the file cannot be opened, the mapping cannot
 * be created, or reading fails.
 */
int
main (int argc, char **argv)
{
     const size_t map_len = 104857600;   /* 100 MiB, size of the mapping */
     int flag=O_RDONLY;
     int fd=-1;
     char * ptr =NULL;
     size_t total=0;                     /* bytes copied into the mapping */

     (void) argc;
     (void) argv;

     if((fd=open("/tmp/test.file",flag))==-1){
         fprintf(stderr,"open file error: %s\n",strerror(errno));
         return -1;
     }

     /* mmap reports failure with MAP_FAILED, not NULL. */
     ptr=mmap(NULL,map_len,PROT_READ|PROT_WRITE,MAP_SHARED|MAP_ANONYMOUS,-1,0);
     if(ptr==MAP_FAILED){
         fprintf(stderr,"mmap error: %s\n",strerror(errno));
         close(fd);
         return -1;
     }

     /*
      * read() may return fewer bytes than requested or be interrupted by a
      * signal, so keep reading until the mapping is full or EOF is reached.
      */
     while(total<map_len){
         ssize_t n=read(fd,ptr+total,map_len-total);

         if(n==0)
             break;                      /* EOF: file is smaller than mapping */
         if(n<0){
             if(errno==EINTR)
                 continue;
             fprintf(stderr,"read error: %s\n",strerror(errno));
             munmap(ptr,map_len);
             close(fd);
             return -1;
         }
         total+=(size_t)n;
     }

     printf("total read %zu bytes\n",total);

     /* Release the mapping and the descriptor explicitly. */
     munmap(ptr,map_len);
     close(fd);

     return 0;
}
read 前
[postgres@ tmp]$ cat /proc/meminfo |grep Shm
Shmem:            138064 kB

read 后

[postgres@ tmp]$ cat /proc/meminfo |grep Shm
Shmem:            238064 kB

创建共享内存头部结构

PGShmemHeader结构体定义

/*
 * Header describing a Postgres shared memory segment: an identifying magic
 * number, the creating process, the segment's total size and first free
 * offset, the dynamic shared memory control segment, the ShmemIndex table,
 * and (outside Windows) the device/inode of the data directory.
 */
typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
{
	int32		magic;			/* magic # to identify Postgres segments */
#define PGShmemMagic  679834894	/* value stored in the magic field */
	pid_t		creatorPID;		/* PID of creating process (set but unread) */
	Size		totalsize;		/* total size of segment */
	Size		freeoffset;		/* offset to first free space */
	dsm_handle	dsm_control;	/* ID of dynamic shared memory control seg */
	void	   *index;			/* pointer to ShmemIndex table */
#ifndef WIN32					/* Windows doesn't have useful inode#s */
	dev_t		device;			/* device data directory is on */
	ino_t		inode;			/* inode number of data directory */
#endif
} PGShmemHeader;
  1. 使用system v 方式创建内存头部信息,其中用$PGDATA 路径的 inode 作为 key
 shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
 memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);
 
 //返回值memAddress 是创建共享内存的起始地址

操作系统可以看到这个共享内存

[postgres@edc-pdf-dmdb02 ~]$ ipcs -m|grep 56
0x0827c9e0 1736767    postgres   600        56         1  
  2. 填充 PGShmemHeader 变量内容,包括 total size 等信息
  hdr = (PGShmemHeader *) memAddress;
  hdr->creatorPID = getpid();
  hdr->magic = PGShmemMagic;
  hdr->dsm_control = 0;
  /* Fill in the data directory ID info, too */
  hdr->device = statbuf.st_dev;
  hdr->inode = statbuf.st_ino;
  /*
   * Initialize space allocation status for segment.
   */
  hdr->totalsize = size;
  3. 复制 PGShmemHeader 到 mmap 申请到的内存主体里
memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));

共享内存分配

创建信号量

  • postgres 每个后台进程和辅助进程都需要信号量

  • 如果没有指明使用 posix 信号量,则使用 system v 信号量

函数名:PGReserveSemaphores(numSemas);

sharedSemas = (PGSemaphore) ShmemAllocUnlocked(PGSemaphoreShmemSize(maxSemas));
							
					-> PGSemaphoreShmemSize(maxSemas) 每个信号量 size 为 128 byte,默认参数需要 128 个信号量,所以此处 size 为 16384 byte
					

image-20240823195412009

创建共享内存分配机制

  1. spinlock 作为最基本的锁机制,需要先初始化
ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t));

从共享内存中分配空间给ShmemLock,真实分配的 size 为内存对齐后的 size ,size = MAXALIGN(size);最小单位是 8 byte

SpinLockInit(ShmemLock);

初始化 spinlock,赋值为 0
  2. 接下来需要做一次内存对齐,代码里的注释如下。从注释可以看到,后面的内存分配都需要调用 ShmemAlloc,并且要按 cache line 对齐,cache line size 是 128
   	/*
  	 * Allocations after this point should go through ShmemAlloc, which
  	 * expects to allocate everything on cache line boundaries.  Make sure the
  	 * first allocation begins on a cache line boundary.
  	 */
  	aligned = (char *)
		(CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset)));
	  shmhdr->freeoffset = aligned - (char *) shmhdr;
#define PG_CACHE_LINE_SIZE		128

初始化子系统内存

这步骤会创建很多模块的内存数据结构,且分配空间给他们,这里主要看一下占大部分空间的 buffer pool

CreateOrAttachShmemStructs();
->
 粗略的看一下有几大块子系统 
 
1 --> xlog, clog, and buffers
 	VarsupShmemInit();
	XLOGShmemInit();
	XLogPrefetchShmemInit();
	XLogRecoveryShmemInit();
	CLOGShmemInit();
	CommitTsShmemInit();
	SUBTRANSShmemInit();
	MultiXactShmemInit();
	InitBufferPool();
2 --> Set up process table
	CreateSharedProcArray();
	CreateSharedBackendStatus();
	TwoPhaseShmemInit();
	BackgroundWorkerShmemInit();

3 --> Set up interprocess signaling mechanisms
	PMSignalShmemInit();
	ProcSignalShmemInit();
	CheckpointerShmemInit();
	AutoVacuumShmemInit();
	ReplicationSlotsShmemInit();
	ReplicationOriginShmemInit();
	WalSndShmemInit();
	WalRcvShmemInit();
	WalSummarizerShmemInit();
	PgArchShmemInit();
	ApplyLauncherShmemInit();
	SlotSyncShmemInit();
等等

创建初始化BufferPool();

函数名 	InitBufferPool();
分配 buffer pool 描述符

每一个 buffer pool page 都有一个单独的描述符,记录 buffer page 的 id 和 flag 等信息

	BufferDescriptors = (BufferDescPadded *)
		ShmemInitStruct("Buffer Descriptors",
						NBuffers * sizeof(BufferDescPadded),
						&foundDescs);
设定 buffer pool blocks 的内存位置

且按 PG_IO_ALIGN_SIZE 做内存对齐,大小为 4k,兼容大多数的磁盘扇区大小和物理页大小

	BufferBlocks = (char *)
		TYPEALIGN(PG_IO_ALIGN_SIZE,
				  ShmemInitStruct("Buffer Blocks",
								  NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
								  &foundBufs));

/*
 * Assumed alignment requirement for direct I/O.  4K corresponds to common
 * sector and memory page size.
 */
#define PG_IO_ALIGN_SIZE		4096
设定buffer pool 条件变量 起始地址
	BufferIOCVArray = (ConditionVariableMinimallyPadded *)
		ShmemInitStruct("Buffer IO Condition Variables",
						NBuffers * sizeof(ConditionVariableMinimallyPadded),
						&foundIOCV);
设定CkptBufferIds变量地址
	CkptBufferIds = (CkptSortItem *)
		ShmemInitStruct("Checkpoint BufferIds",
						NBuffers * sizeof(CkptSortItem), &foundBufCkpt);

​ 初始化完 shared buffer 后的共享内存结构图