summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Anatoly Burakov <anatoly.burakov@intel.com> 2018-04-11 13:30:24 +0100
committer Thomas Monjalon <thomas@monjalon.net> 2018-04-11 19:55:39 +0200
commit 66cc45e293ed67d4e83fb8a1174089c58610a8e1 (patch)
tree 5e0f069e4d209b1b518922c49f2b707fdf106690
parent c44d09811b40815d0b6d4b297f7709c741197774 (diff)
download dpdk-66cc45e293ed.zip
dpdk-66cc45e293ed.tar.gz
dpdk-66cc45e293ed.tar.xz
mem: replace memseg with memseg lists
Before, we were aggregating multiple pages into one memseg, so the number of memsegs was small. Now, each page gets its own memseg, so the list of memsegs is huge. To accommodate the new memseg list size and to keep the under-the-hood workings sane, the memseg list is now not just a single list, but multiple lists. To be precise, each hugepage size available on the system gets one or more memseg lists, per socket. In order to support dynamic memory allocation, we reserve all memory in advance (unless we're in 32-bit legacy mode, in which case we do not preallocate memory). As in, we do an anonymous mmap() of the entire maximum size of memory per hugepage size, per socket (which is limited to either RTE_MAX_MEMSEG_PER_TYPE pages or RTE_MAX_MEM_MB_PER_TYPE megabytes worth of memory, whichever is the smaller one), split over multiple lists (which are limited to either RTE_MAX_MEMSEG_PER_LIST memsegs or RTE_MAX_MEM_MB_PER_LIST megabytes per list, whichever is the smaller one). There is also a global limit of CONFIG_RTE_MAX_MEM_MB megabytes, which is mainly used for 32-bit targets to limit amounts of preallocated memory, but can be used to place an upper limit on total amount of VA memory that can be allocated by DPDK application. So, for each hugepage size, we get (by default) up to 128G worth of memory, per socket, split into chunks of up to 32G in size. The address space is claimed at the start, in eal_common_memory.c. The actual page allocation code is in eal_memalloc.c (Linux-only), and largely consists of copied EAL memory init code. Pages in the list are also indexed by address. That is, in order to figure out where the page belongs, one can simply look at base address for a memseg list. Similarly, figuring out IOVA address of a memzone is a matter of finding the right memseg list, getting offset and dividing by page size to get the appropriate memseg. 
This commit also removes rte_eal_get_physmem_layout() call, according to deprecation notice [1], and removes that deprecation notice as well. On 32-bit targets due to limited VA space, DPDK will no longer spread memory to different sockets like before. Instead, it will (by default) allocate all of the memory on socket where master lcore is. To override this behavior, --socket-mem must be used. The rest of the changes are really ripple effects from the memseg change - heap changes, compile fixes, and rewrites to support fbarray-backed memseg lists. Due to earlier switch to _walk() functions, most of the changes are simple fixes, however some of the _walk() calls were switched to memseg list walk, where it made sense to do so. Additionally, we are also switching locks from flock() to fcntl(). Down the line, we will be introducing single-file segments option, and we cannot use flock() locks to lock parts of the file. Therefore, we will use fcntl() locks for legacy mem as well, in case someone is unfortunate enough to accidentally start legacy mem primary process alongside an already working non-legacy mem-based primary process. [1] http://dpdk.org/dev/patchwork/patch/34002/ Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com> Tested-by: Santosh Shukla <santosh.shukla@caviumnetworks.com> Tested-by: Hemant Agrawal <hemant.agrawal@nxp.com> Tested-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
-rw-r--r-- config/common_base | 15
-rw-r--r-- config/defconfig_i686-native-linuxapp-gcc | 3
-rw-r--r-- config/defconfig_i686-native-linuxapp-icc | 3
-rw-r--r-- config/defconfig_x86_x32-native-linuxapp-gcc | 3
-rw-r--r-- config/rte_config.h | 7
-rw-r--r-- doc/guides/rel_notes/deprecation.rst | 9
-rw-r--r-- drivers/bus/fslmc/fslmc_vfio.c | 10
-rw-r--r-- drivers/bus/fslmc/portal/dpaa2_hw_pvt.h | 2
-rw-r--r-- drivers/bus/pci/linux/pci.c | 8
-rw-r--r-- drivers/crypto/dpaa_sec/dpaa_sec.c | 2
-rw-r--r-- drivers/net/mlx4/mlx4_mr.c | 4
-rw-r--r-- drivers/net/mlx5/mlx5.c | 3
-rw-r--r-- drivers/net/mlx5/mlx5_mr.c | 4
-rw-r--r-- drivers/net/virtio/virtio_user/vhost_kernel.c | 4
-rw-r--r-- lib/librte_eal/bsdapp/eal/eal.c | 12
-rw-r--r-- lib/librte_eal/bsdapp/eal/eal_hugepage_info.c | 17
-rw-r--r-- lib/librte_eal/bsdapp/eal/eal_memory.c | 209
-rw-r--r-- lib/librte_eal/common/eal_common_memory.c | 603
-rw-r--r-- lib/librte_eal/common/eal_common_memzone.c | 48
-rw-r--r-- lib/librte_eal/common/eal_hugepages.h | 1
-rw-r--r-- lib/librte_eal/common/eal_internal_cfg.h | 2
-rw-r--r-- lib/librte_eal/common/include/rte_eal_memconfig.h | 22
-rw-r--r-- lib/librte_eal/common/include/rte_memory.h | 59
-rw-r--r-- lib/librte_eal/common/include/rte_memzone.h | 1
-rw-r--r-- lib/librte_eal/common/malloc_elem.c | 12
-rw-r--r-- lib/librte_eal/common/malloc_elem.h | 6
-rw-r--r-- lib/librte_eal/common/malloc_heap.c | 62
-rw-r--r-- lib/librte_eal/common/rte_malloc.c | 22
-rw-r--r-- lib/librte_eal/linuxapp/eal/eal.c | 15
-rw-r--r-- lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 25
-rw-r--r-- lib/librte_eal/linuxapp/eal/eal_memory.c | 914
-rw-r--r-- lib/librte_eal/linuxapp/eal/eal_vfio.c | 9
-rw-r--r-- lib/librte_eal/rte_eal_version.map | 3
-rw-r--r-- lib/librte_mempool/rte_mempool.c | 9
-rw-r--r-- test/test/test_malloc.c | 30
-rw-r--r-- test/test/test_memory.c | 10
-rw-r--r-- test/test/test_memzone.c | 12
37 files changed, 1590 insertions, 590 deletions
diff --git a/config/common_base b/config/common_base
index c09c7cf..f557e6b 100644
--- a/config/common_base
+++ b/config/common_base
@@ -61,7 +61,20 @@ CONFIG_RTE_CACHE_LINE_SIZE=64
CONFIG_RTE_LIBRTE_EAL=y
CONFIG_RTE_MAX_LCORE=128
CONFIG_RTE_MAX_NUMA_NODES=8
-CONFIG_RTE_MAX_MEMSEG=256
+CONFIG_RTE_MAX_MEMSEG_LISTS=64
+# each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
+# or RTE_MAX_MEM_MB_PER_LIST megabytes worth of memory, whichever is smaller
+CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
+CONFIG_RTE_MAX_MEM_MB_PER_LIST=32768
+# a "type" is a combination of page size and NUMA node. total number of memseg
+# lists per type will be limited to either RTE_MAX_MEMSEG_PER_TYPE pages (split
+# over multiple lists of RTE_MAX_MEMSEG_PER_LIST pages), or
+# RTE_MAX_MEM_MB_PER_TYPE megabytes of memory (split over multiple lists of
+# RTE_MAX_MEM_MB_PER_LIST), whichever is smaller
+CONFIG_RTE_MAX_MEMSEG_PER_TYPE=32768
+CONFIG_RTE_MAX_MEM_MB_PER_TYPE=131072
+# global maximum usable amount of VA, in megabytes
+CONFIG_RTE_MAX_MEM_MB=524288
CONFIG_RTE_MAX_MEMZONE=2560
CONFIG_RTE_MAX_TAILQ=32
CONFIG_RTE_ENABLE_ASSERT=n
diff --git a/config/defconfig_i686-native-linuxapp-gcc b/config/defconfig_i686-native-linuxapp-gcc
index a42ba4f..1178fe3 100644
--- a/config/defconfig_i686-native-linuxapp-gcc
+++ b/config/defconfig_i686-native-linuxapp-gcc
@@ -46,3 +46,6 @@ CONFIG_RTE_LIBRTE_PMD_ZUC=n
# AVP PMD is not supported on 32-bit
#
CONFIG_RTE_LIBRTE_AVP_PMD=n
+
+# 32-bit doesn't break up memory in lists, but does have VA allocation limit
+CONFIG_RTE_MAX_MEM_MB=2048
diff --git a/config/defconfig_i686-native-linuxapp-icc b/config/defconfig_i686-native-linuxapp-icc
index 144ba0a..f096e22 100644
--- a/config/defconfig_i686-native-linuxapp-icc
+++ b/config/defconfig_i686-native-linuxapp-icc
@@ -51,3 +51,6 @@ CONFIG_RTE_LIBRTE_PMD_ZUC=n
# AVP PMD is not supported on 32-bit
#
CONFIG_RTE_LIBRTE_AVP_PMD=n
+
+# 32-bit doesn't break up memory in lists, but does have VA allocation limit
+CONFIG_RTE_MAX_MEM_MB=2048
diff --git a/config/defconfig_x86_x32-native-linuxapp-gcc b/config/defconfig_x86_x32-native-linuxapp-gcc
index b6206a5..57d000d 100644
--- a/config/defconfig_x86_x32-native-linuxapp-gcc
+++ b/config/defconfig_x86_x32-native-linuxapp-gcc
@@ -26,3 +26,6 @@ CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
# AVP PMD is not supported on 32-bit
#
CONFIG_RTE_LIBRTE_AVP_PMD=n
+
+# 32-bit doesn't break up memory in lists, but does have VA allocation limit
+CONFIG_RTE_MAX_MEM_MB=2048
diff --git a/config/rte_config.h b/config/rte_config.h
index db6ceb6..f293d9e 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -21,7 +21,12 @@
/****** library defines ********/
/* EAL defines */
-#define RTE_MAX_MEMSEG 512
+#define RTE_MAX_MEMSEG_LISTS 128
+#define RTE_MAX_MEMSEG_PER_LIST 8192
+#define RTE_MAX_MEM_MB_PER_LIST 32768
+#define RTE_MAX_MEMSEG_PER_TYPE 32768
+#define RTE_MAX_MEM_MB_PER_TYPE 65536
+#define RTE_MAX_MEM_MB 524288
#define RTE_MAX_MEMZONE 2560
#define RTE_MAX_TAILQ 32
#define RTE_LOG_DP_LEVEL RTE_LOG_INFO
diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index ec70b5f..c9f2703 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -38,15 +38,6 @@ Deprecation Notices
success and failure, respectively. This will change to 1 and 0 for true and
false, respectively, to make use of the function more intuitive.
-* eal: due to internal data layout reorganization, there will be changes to
- several structures and functions as a result of coming changes to support
- memory hotplug in v18.05.
- ``rte_eal_get_physmem_layout`` will be deprecated and removed in subsequent
- releases.
- ``rte_mem_config`` contents will change due to switch to memseg lists.
- ``rte_memzone`` member ``memseg_id`` will no longer serve any useful purpose
- and will be removed.
-
* eal: a new set of mbuf mempool ops name APIs for user, platform and best
mempool names have been defined in ``rte_mbuf`` in v18.02. The uses of
``rte_eal_mbuf_default_mempool_ops`` shall be replaced by
diff --git a/drivers/bus/fslmc/fslmc_vfio.c b/drivers/bus/fslmc/fslmc_vfio.c
index 0c048dc..8b15312 100644
--- a/drivers/bus/fslmc/fslmc_vfio.c
+++ b/drivers/bus/fslmc/fslmc_vfio.c
@@ -190,7 +190,8 @@ static int vfio_map_irq_region(struct fslmc_vfio_group *group)
}
static int
-fslmc_vfio_map(const struct rte_memseg *ms, void *arg)
+fslmc_vfio_map(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
{
int *n_segs = arg;
struct fslmc_vfio_group *group;
@@ -232,18 +233,11 @@ fslmc_vfio_map(const struct rte_memseg *ms, void *arg)
int rte_fslmc_vfio_dmamap(void)
{
- const struct rte_memseg *memseg;
int i = 0;
if (is_dma_done)
return 0;
- memseg = rte_eal_get_physmem_layout();
- if (memseg == NULL) {
- DPAA2_BUS_ERR("Cannot get physical layout");
- return -ENODEV;
- }
-
if (rte_memseg_walk(fslmc_vfio_map, &i) < 0)
return -1;
diff --git a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
index 45fd41e..72aae43 100644
--- a/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
+++ b/drivers/bus/fslmc/portal/dpaa2_hw_pvt.h
@@ -274,7 +274,7 @@ static phys_addr_t dpaa2_mem_vtop(uint64_t vaddr)
if (dpaa2_virt_mode)
return vaddr;
- memseg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr);
+ memseg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
if (memseg)
return memseg->phys_addr + RTE_PTR_DIFF(vaddr, memseg->addr);
return (size_t)NULL;
diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index 6dda054..4630a80 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -117,9 +117,10 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
}
static int
-find_max_end_va(const struct rte_memseg *ms, void *arg)
+find_max_end_va(const struct rte_memseg_list *msl, void *arg)
{
- void *end_va = RTE_PTR_ADD(ms->addr, ms->len);
+ size_t sz = msl->memseg_arr.len * msl->page_sz;
+ void *end_va = RTE_PTR_ADD(msl->base_va, sz);
void **max_va = arg;
if (*max_va < end_va)
@@ -132,10 +133,11 @@ pci_find_max_end_va(void)
{
void *va = NULL;
- rte_memseg_walk(find_max_end_va, &va);
+ rte_memseg_list_walk(find_max_end_va, &va);
return va;
}
+
/* parse one line of the "resource" sysfs file (note that the 'line'
* string is modified)
*/
diff --git a/drivers/crypto/dpaa_sec/dpaa_sec.c b/drivers/crypto/dpaa_sec/dpaa_sec.c
index a14e669..b685220 100644
--- a/drivers/crypto/dpaa_sec/dpaa_sec.c
+++ b/drivers/crypto/dpaa_sec/dpaa_sec.c
@@ -95,7 +95,7 @@ dpaa_mem_vtop(void *vaddr)
{
const struct rte_memseg *ms;
- ms = rte_mem_virt2memseg(vaddr);
+ ms = rte_mem_virt2memseg(vaddr, NULL);
if (ms)
return ms->iova + RTE_PTR_DIFF(vaddr, ms->addr);
return (size_t)NULL;
diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index b7e910d..e69b433 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -141,10 +141,10 @@ mlx4_mr_get(struct priv *priv, struct rte_mempool *mp)
(void *)mp, (void *)start, (void *)end,
(size_t)(end - start));
/* Round start and end to page boundary if found in memory segments. */
- ms = rte_mem_virt2memseg((void *)start);
+ ms = rte_mem_virt2memseg((void *)start, NULL);
if (ms != NULL)
start = RTE_ALIGN_FLOOR(start, ms->hugepage_sz);
- ms = rte_mem_virt2memseg((void *)end);
+ ms = rte_mem_virt2memseg((void *)end, NULL);
if (ms != NULL)
end = RTE_ALIGN_CEIL(end, ms->hugepage_sz);
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 00c2c86..369ea45 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -478,7 +478,8 @@ static struct rte_pci_driver mlx5_driver;
static void *uar_base;
static int
-find_lower_va_bound(const struct rte_memseg *ms, void *arg)
+find_lower_va_bound(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
{
void **addr = arg;
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index c96e134..fdf7b3e 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -262,10 +262,10 @@ mlx5_mr_new(struct rte_eth_dev *dev, struct rte_mempool *mp)
mr->end = end;
/* Round start and end to page boundary if found in memory segments. */
- ms = rte_mem_virt2memseg((void *)start);
+ ms = rte_mem_virt2memseg((void *)start, NULL);
if (ms != NULL)
start = RTE_ALIGN_FLOOR(start, ms->hugepage_sz);
- ms = rte_mem_virt2memseg((void *)end);
+ ms = rte_mem_virt2memseg((void *)end, NULL);
if (ms != NULL)
end = RTE_ALIGN_CEIL(end, ms->hugepage_sz);
diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c
index 93d7efe..b244409 100644
--- a/drivers/net/virtio/virtio_user/vhost_kernel.c
+++ b/drivers/net/virtio/virtio_user/vhost_kernel.c
@@ -75,7 +75,8 @@ struct walk_arg {
uint32_t region_nr;
};
static int
-add_memory_region(const struct rte_memseg *ms, size_t len, void *arg)
+add_memory_region(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, size_t len, void *arg)
{
struct walk_arg *wa = arg;
struct vhost_memory_region *mr;
@@ -95,7 +96,6 @@ add_memory_region(const struct rte_memseg *ms, size_t len, void *arg)
return 0;
}
-
/* By default, vhost kernel module allows 64 regions, but DPDK allows
* 256 segments. As a relief, below function merges those virtually
* adjacent memsegs into one region.
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index f44b904..d009cf0 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -64,8 +64,8 @@ static int mem_cfg_fd = -1;
static struct flock wr_lock = {
.l_type = F_WRLCK,
.l_whence = SEEK_SET,
- .l_start = offsetof(struct rte_mem_config, memseg),
- .l_len = sizeof(early_mem_config.memseg),
+ .l_start = offsetof(struct rte_mem_config, memsegs),
+ .l_len = sizeof(early_mem_config.memsegs),
};
/* Address of global and public configuration */
@@ -430,11 +430,11 @@ out:
}
static int
-check_socket(const struct rte_memseg *ms, void *arg)
+check_socket(const struct rte_memseg_list *msl, void *arg)
{
int *socket_id = arg;
- if (ms->socket_id == *socket_id)
+ if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)
return 1;
return 0;
@@ -447,10 +447,11 @@ eal_check_mem_on_local_socket(void)
socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
- if (rte_memseg_walk(check_socket, &socket_id) == 0)
+ if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
}
+
static int
sync_func(__attribute__((unused)) void *arg)
{
@@ -561,7 +562,6 @@ rte_eal_init(int argc, char **argv)
rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class();
if (internal_config.no_hugetlbfs == 0 &&
- internal_config.process_type != RTE_PROC_SECONDARY &&
eal_hugepage_info_init() < 0) {
rte_eal_init_alert("Cannot get hugepage information.");
rte_errno = EACCES;
diff --git a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
index be2dbf0..ba44da0 100644
--- a/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/bsdapp/eal/eal_hugepage_info.c
@@ -47,12 +47,18 @@ eal_hugepage_info_init(void)
struct hugepage_info *hpi = &internal_config.hugepage_info[0];
struct hugepage_info *tmp_hpi;
+ internal_config.num_hugepage_sizes = 1;
+
+ /* nothing more to be done for secondary */
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+ return 0;
+
sysctl_size = sizeof(num_buffers);
error = sysctlbyname("hw.contigmem.num_buffers", &num_buffers,
&sysctl_size, NULL, 0);
if (error != 0) {
- RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.num_buffers");
+ RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.num_buffers\n");
return -1;
}
@@ -61,7 +67,7 @@ eal_hugepage_info_init(void)
&sysctl_size, NULL, 0);
if (error != 0) {
- RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.buffer_size");
+ RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.buffer_size\n");
return -1;
}
@@ -81,22 +87,21 @@ eal_hugepage_info_init(void)
RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dKB\n",
num_buffers, (int)(buffer_size>>10));
- internal_config.num_hugepage_sizes = 1;
hpi->hugedir = CONTIGMEM_DEV;
hpi->hugepage_sz = buffer_size;
hpi->num_pages[0] = num_buffers;
hpi->lock_descriptor = fd;
tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
- sizeof(struct hugepage_info));
+ sizeof(internal_config.hugepage_info));
if (tmp_hpi == NULL ) {
RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
return -1;
}
- memcpy(tmp_hpi, hpi, sizeof(struct hugepage_info));
+ memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info));
- if ( munmap(tmp_hpi, sizeof(struct hugepage_info)) < 0) {
+ if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
return -1;
}
diff --git a/lib/librte_eal/bsdapp/eal/eal_memory.c b/lib/librte_eal/bsdapp/eal/eal_memory.c
index bdfb882..2f5651d 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memory.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memory.c
@@ -6,6 +6,8 @@
#include <sys/types.h>
#include <sys/sysctl.h>
#include <inttypes.h>
+#include <errno.h>
+#include <string.h>
#include <fcntl.h>
#include <rte_eal.h>
@@ -41,37 +43,135 @@ rte_eal_hugepage_init(void)
struct rte_mem_config *mcfg;
uint64_t total_mem = 0;
void *addr;
- unsigned i, j, seg_idx = 0;
+ unsigned int i, j, seg_idx = 0;
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
/* for debug purposes, hugetlbfs can be disabled */
if (internal_config.no_hugetlbfs) {
- addr = malloc(internal_config.memory);
- mcfg->memseg[0].iova = (rte_iova_t)(uintptr_t)addr;
- mcfg->memseg[0].addr = addr;
- mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
- mcfg->memseg[0].len = internal_config.memory;
- mcfg->memseg[0].socket_id = 0;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ struct rte_memseg *ms;
+ uint64_t page_sz;
+ int n_segs, cur_seg;
+
+ /* create a memseg list */
+ msl = &mcfg->memsegs[0];
+
+ page_sz = RTE_PGSIZE_4K;
+ n_segs = internal_config.memory / page_sz;
+
+ if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
+ return -1;
+ }
+
+ addr = mmap(NULL, internal_config.memory,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+ if (addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
+ strerror(errno));
+ return -1;
+ }
+ msl->base_va = addr;
+ msl->page_sz = page_sz;
+ msl->socket_id = 0;
+
+ /* populate memsegs. each memseg is 1 page long */
+ for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
+ arr = &msl->memseg_arr;
+
+ ms = rte_fbarray_get(arr, cur_seg);
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ ms->iova = (uintptr_t)addr;
+ else
+ ms->iova = RTE_BAD_IOVA;
+ ms->addr = addr;
+ ms->hugepage_sz = page_sz;
+ ms->len = page_sz;
+ ms->socket_id = 0;
+
+ rte_fbarray_set_used(arr, cur_seg);
+
+ addr = RTE_PTR_ADD(addr, page_sz);
+ }
return 0;
}
/* map all hugepages and sort them */
for (i = 0; i < internal_config.num_hugepage_sizes; i ++){
struct hugepage_info *hpi;
+ uint64_t page_sz, mem_needed;
+ unsigned int n_pages, max_pages;
hpi = &internal_config.hugepage_info[i];
- for (j = 0; j < hpi->num_pages[0]; j++) {
+ page_sz = hpi->hugepage_sz;
+ max_pages = hpi->num_pages[0];
+ mem_needed = RTE_ALIGN_CEIL(internal_config.memory - total_mem,
+ page_sz);
+
+ n_pages = RTE_MIN(mem_needed / page_sz, max_pages);
+
+ for (j = 0; j < n_pages; j++) {
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
struct rte_memseg *seg;
+ int msl_idx, ms_idx;
rte_iova_t physaddr;
int error;
size_t sysctl_size = sizeof(physaddr);
char physaddr_str[64];
- addr = mmap(NULL, hpi->hugepage_sz, PROT_READ|PROT_WRITE,
- MAP_SHARED, hpi->lock_descriptor,
- j * EAL_PAGE_SIZE);
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
+ msl_idx++) {
+ bool empty;
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+
+ if (msl->page_sz != page_sz)
+ continue;
+
+ empty = arr->count == 0;
+
+ /* we need 1, plus hole if not empty */
+ ms_idx = rte_fbarray_find_next_n_free(arr,
+ 0, 1 + (empty ? 1 : 0));
+
+ /* memseg list is full? */
+ if (ms_idx < 0)
+ continue;
+
+ /* leave some space between memsegs, they are
+ * not IOVA contiguous, so they shouldn't be VA
+ * contiguous either.
+ */
+ if (!empty)
+ ms_idx++;
+
+ break;
+ }
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),
+ RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE));
+ return -1;
+ }
+ arr = &msl->memseg_arr;
+ seg = rte_fbarray_get(arr, ms_idx);
+
+ addr = RTE_PTR_ADD(msl->base_va,
+ (size_t)msl->page_sz * ms_idx);
+
+ /* address is already mapped in memseg list, so using
+ * MAP_FIXED here is safe.
+ */
+ addr = mmap(addr, page_sz, PROT_READ|PROT_WRITE,
+ MAP_SHARED | MAP_FIXED,
+ hpi->lock_descriptor,
+ j * EAL_PAGE_SIZE);
if (addr == MAP_FAILED) {
RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
j, hpi->hugedir);
@@ -88,33 +188,62 @@ rte_eal_hugepage_init(void)
return -1;
}
- seg = &mcfg->memseg[seg_idx++];
seg->addr = addr;
seg->iova = physaddr;
- seg->hugepage_sz = hpi->hugepage_sz;
- seg->len = hpi->hugepage_sz;
+ seg->hugepage_sz = page_sz;
+ seg->len = page_sz;
seg->nchannel = mcfg->nchannel;
seg->nrank = mcfg->nrank;
seg->socket_id = 0;
+ rte_fbarray_set_used(arr, ms_idx);
+
RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%"
PRIx64", len %zu\n",
- seg_idx, addr, physaddr, hpi->hugepage_sz);
- if (total_mem >= internal_config.memory ||
- seg_idx >= RTE_MAX_MEMSEG)
- break;
+ seg_idx, addr, physaddr, page_sz);
+
+ total_mem += seg->len;
}
+ if (total_mem >= internal_config.memory)
+ break;
+ }
+ if (total_mem < internal_config.memory) {
+ RTE_LOG(ERR, EAL, "Couldn't reserve requested memory, "
+ "requested: %" PRIu64 "M "
+ "available: %" PRIu64 "M\n",
+ internal_config.memory >> 20, total_mem >> 20);
+ return -1;
}
return 0;
}
+struct attach_walk_args {
+ int fd_hugepage;
+ int seg_idx;
+};
+static int
+attach_segment(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
+{
+ struct attach_walk_args *wa = arg;
+ void *addr;
+
+ addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, wa->fd_hugepage,
+ wa->seg_idx * EAL_PAGE_SIZE);
+ if (addr == MAP_FAILED || addr != ms->addr)
+ return -1;
+ wa->seg_idx++;
+
+ return 0;
+}
+
int
rte_eal_hugepage_attach(void)
{
const struct hugepage_info *hpi;
int fd_hugepage_info, fd_hugepage = -1;
- unsigned i = 0;
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int i;
/* Obtain a file descriptor for hugepage_info */
fd_hugepage_info = open(eal_hugepage_info_path(), O_RDONLY);
@@ -124,41 +253,43 @@ rte_eal_hugepage_attach(void)
}
/* Map the shared hugepage_info into the process address spaces */
- hpi = mmap(NULL, sizeof(struct hugepage_info), PROT_READ, MAP_PRIVATE,
- fd_hugepage_info, 0);
+ hpi = mmap(NULL, sizeof(internal_config.hugepage_info),
+ PROT_READ, MAP_PRIVATE, fd_hugepage_info, 0);
if (hpi == MAP_FAILED) {
RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
goto error;
}
- /* Obtain a file descriptor for contiguous memory */
- fd_hugepage = open(hpi->hugedir, O_RDWR);
- if (fd_hugepage < 0) {
- RTE_LOG(ERR, EAL, "Could not open %s\n", hpi->hugedir);
- goto error;
- }
+ for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+ const struct hugepage_info *cur_hpi = &hpi[i];
+ struct attach_walk_args wa;
- /* Map the contiguous memory into each memory segment */
- for (i = 0; i < hpi->num_pages[0]; i++) {
+ memset(&wa, 0, sizeof(wa));
- void *addr;
- struct rte_memseg *seg = &mcfg->memseg[i];
+ /* Obtain a file descriptor for contiguous memory */
+ fd_hugepage = open(cur_hpi->hugedir, O_RDWR);
+ if (fd_hugepage < 0) {
+ RTE_LOG(ERR, EAL, "Could not open %s\n",
+ cur_hpi->hugedir);
+ goto error;
+ }
+ wa.fd_hugepage = fd_hugepage;
+ wa.seg_idx = 0;
- addr = mmap(seg->addr, hpi->hugepage_sz, PROT_READ|PROT_WRITE,
- MAP_SHARED|MAP_FIXED, fd_hugepage,
- i * EAL_PAGE_SIZE);
- if (addr == MAP_FAILED || addr != seg->addr) {
+ /* Map the contiguous memory into each memory segment */
+ if (rte_memseg_walk(attach_segment, &wa) < 0) {
RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
- i, hpi->hugedir);
+ wa.seg_idx, cur_hpi->hugedir);
goto error;
}
+ close(fd_hugepage);
+ fd_hugepage = -1;
}
/* hugepage_info is no longer required */
- munmap((void *)(uintptr_t)hpi, sizeof(struct hugepage_info));
+ munmap((void *)(uintptr_t)hpi, sizeof(internal_config.hugepage_info));
close(fd_hugepage_info);
- close(fd_hugepage);
return 0;
error:
diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
index fd78d2f..d519f15 100644
--- a/lib/librte_eal/common/eal_common_memory.c
+++ b/lib/librte_eal/common/eal_common_memory.c
@@ -13,6 +13,7 @@
#include <sys/mman.h>
#include <sys/queue.h>
+#include <rte_fbarray.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
@@ -30,6 +31,8 @@
* which is a multiple of hugepage size.
*/
+#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
+
static uint64_t baseaddr_offset;
static uint64_t system_page_sz;
@@ -120,15 +123,394 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
return aligned_addr;
}
-/*
- * Return a pointer to a read-only table of struct rte_physmem_desc
- * elements, containing the layout of all addressable physical
- * memory. The last element of the table contains a NULL address.
- */
-const struct rte_memseg *
-rte_eal_get_physmem_layout(void)
+/*
+ * Compute how many bytes a single memseg list should cover for the given
+ * page size. The result is capped by RTE_MAX_MEMSEG_PER_LIST pages, by
+ * RTE_MAX_MEM_MB_PER_LIST megabytes and by the caller-supplied max_mem, is
+ * never smaller than one page, and is aligned to the page size.
+ */
+static uint64_t
+get_mem_amount(uint64_t page_sz, uint64_t max_mem)
+{
+	const uint64_t list_cap = (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20;
+	uint64_t sz = page_sz * (uint64_t)RTE_MAX_MEMSEG_PER_LIST;
+
+	/* the per-list megabyte limit overrides a larger caller budget */
+	if (max_mem > list_cap)
+		max_mem = list_cap;
+
+	/* page-count limit vs. byte limit, whichever is smaller */
+	if (sz > max_mem)
+		sz = max_mem;
+
+	/* a list must be able to hold at least one page */
+	if (sz < page_sz)
+		sz = page_sz;
+
+	return RTE_ALIGN(sz, page_sz);
+}
+
+/*
+ * Set up one memseg list: create its backing fbarray (named after page size
+ * in kB, socket and per-type list index) sized from get_mem_amount(), and
+ * record page size and socket. VA space is not reserved here; base_va is left
+ * NULL. Returns 0 on success, -1 if the fbarray could not be created.
+ */
+static int
+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
+		uint64_t max_mem, int socket_id, int type_msl_idx)
+{
+	char arr_name[RTE_FBARRAY_NAME_LEN];
+	int n_segs;
+
+	/* number of page-sized slots this list will track */
+	n_segs = get_mem_amount(page_sz, max_mem) / page_sz;
+
+	snprintf(arr_name, sizeof(arr_name), MEMSEG_LIST_FMT, page_sz >> 10,
+			socket_id, type_msl_idx);
+
+	if (rte_fbarray_init(&msl->memseg_arr, arr_name, n_segs,
+			sizeof(struct rte_memseg)) != 0) {
+		RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
+			rte_strerror(rte_errno));
+		return -1;
+	}
+
+	msl->base_va = NULL;
+	msl->socket_id = socket_id;
+	msl->page_sz = page_sz;
+
+	RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
+			(size_t)page_sz >> 10, socket_id);
+
+	return 0;
+}
+
+/*
+ * Reserve virtual address space for every slot of a memseg list
+ * (page size times fbarray length) via eal_get_virtual_area(), using the
+ * list's current base_va as the requested address, and store the resulting
+ * address back into base_va. Returns 0 on success, -1 on failure.
+ */
+static int
+alloc_va_space(struct rte_memseg_list *msl)
+{
+	int mmap_flags = 0;
+	uint64_t pgsz = msl->page_sz;
+	size_t len = pgsz * msl->memseg_arr.len;
+	void *va;
+
+#ifdef RTE_ARCH_PPC_64
+	mmap_flags |= MAP_HUGETLB;
+#endif
+
+	va = eal_get_virtual_area(msl->base_va, &len, pgsz, 0, mmap_flags);
+	if (va != NULL) {
+		msl->base_va = va;
+		return 0;
+	}
+
+	/* reservation failed: tell the user whether a fixed address is at fault */
+	if (rte_errno == EADDRNOTAVAIL)
+		RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
+			(unsigned long long)len, msl->base_va);
+	else
+		RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+
+	return -1;
+}
+
+/*
+ * Primary-process memseg list setup used on non-64-bit builds (selected in
+ * rte_eal_memory_init() under #ifndef RTE_ARCH_64).
+ *
+ * Splits a budget of RTE_MAX_MEM_MB_PER_TYPE megabytes between the sockets
+ * that will be used, then, per socket and per hugepage size, creates memseg
+ * lists and reserves VA space for them until that socket's budget or
+ * RTE_MAX_MEMSEG_PER_TYPE segments is reached.
+ *
+ * Returns 0 on success (including the no-hugetlbfs and legacy-mem cases,
+ * where nothing is done here), -1 on error.
+ */
+static int __rte_unused
+memseg_primary_init_32(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int active_sockets, hpi_idx, msl_idx = 0;
+ unsigned int socket_id, i;
+ struct rte_memseg_list *msl;
+ uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem;
+ uint64_t max_mem;
+
+ /* no-huge does not need this at all */
+ if (internal_config.no_hugetlbfs)
+ return 0;
+
+ /* this is a giant hack, but desperate times call for desperate
+ * measures. in legacy 32-bit mode, we cannot preallocate VA space,
+ * because having upwards of 2 gigabytes of VA space already mapped will
+ * interfere with our ability to map and sort hugepages.
+ *
+ * therefore, in legacy 32-bit mode, we will be initializing memseg
+ * lists much later - in eal_memory.c, right after we unmap all the
+ * unneeded pages. this will not affect secondary processes, as those
+ * should be able to mmap the space without (too many) problems.
+ */
+ if (internal_config.legacy_mem)
+ return 0;
+
+ /* 32-bit mode is a very special case. we cannot know in advance where
+ * the user will want to allocate their memory, so we have to do some
+ * heuristics.
+ */
+ active_sockets = 0;
+ total_requested_mem = 0;
+ /* with forced sockets, count only sockets that have a non-zero
+ * socket_mem request and sum up those requests; otherwise the total
+ * request is simply internal_config.memory and active_sockets stays 0.
+ */
+ if (internal_config.force_sockets)
+ for (i = 0; i < rte_socket_count(); i++) {
+ uint64_t mem;
+
+ socket_id = rte_socket_id_by_idx(i);
+ mem = internal_config.socket_mem[socket_id];
+
+ if (mem == 0)
+ continue;
+
+ active_sockets++;
+ total_requested_mem += mem;
+ }
+ else
+ total_requested_mem = internal_config.memory;
+
+ /* overall budget in bytes; requests above it are rejected */
+ max_mem = (uint64_t) RTE_MAX_MEM_MB_PER_TYPE << 20;
+ if (total_requested_mem > max_mem) {
+ RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n",
+ (unsigned int)(max_mem >> 20));
+ return -1;
+ }
+ /* whatever was not explicitly requested is shared equally between the
+ * active sockets (or given in full when no socket was requested)
+ */
+ total_extra_mem = max_mem - total_requested_mem;
+ extra_mem_per_socket = active_sockets == 0 ? total_extra_mem :
+ total_extra_mem / active_sockets;
+
+ /* the allocation logic is a little bit convoluted, but here's how it
+ * works, in a nutshell:
+ * - if user hasn't specified on which sockets to allocate memory via
+ * --socket-mem, we allocate all of our memory on master core socket.
+ * - if user has specified sockets to allocate memory on, there may be
+ * some "unused" memory left (e.g. if user has specified --socket-mem
+ * such that not all memory adds up to 2 gigabytes), so add it to all
+ * sockets that are in use equally.
+ *
+ * page sizes are sorted by size in descending order, so we can safely
+ * assume that we dispense with bigger page sizes first.
+ */
+
+ /* create memseg lists */
+ for (i = 0; i < rte_socket_count(); i++) {
+ int hp_sizes = (int) internal_config.num_hugepage_sizes;
+ uint64_t max_socket_mem, cur_socket_mem;
+ unsigned int master_lcore_socket;
+ struct rte_config *cfg = rte_eal_get_configuration();
+ bool skip;
+
+ socket_id = rte_socket_id_by_idx(i);
+
+ /* without NUMA-aware hugepages only socket 0 is handled */
+#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (socket_id > 0)
+ break;
+#endif
+
+ /* if we didn't specifically request memory on this socket */
+ skip = active_sockets != 0 &&
+ internal_config.socket_mem[socket_id] == 0;
+ /* ...or if we didn't specifically request memory on *any*
+ * socket, and this is not master lcore
+ */
+ master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore);
+ skip |= active_sockets == 0 && socket_id != master_lcore_socket;
+
+ if (skip) {
+ RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n",
+ socket_id);
+ continue;
+ }
+
+ /* max amount of memory on this socket */
+ max_socket_mem = (active_sockets != 0 ?
+ internal_config.socket_mem[socket_id] :
+ internal_config.memory) +
+ extra_mem_per_socket;
+ cur_socket_mem = 0;
+
+ /* hand out this socket's budget one hugepage size at a time */
+ for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) {
+ uint64_t max_pagesz_mem, cur_pagesz_mem = 0;
+ uint64_t hugepage_sz;
+ struct hugepage_info *hpi;
+ int type_msl_idx, max_segs, total_segs = 0;
+
+ hpi = &internal_config.hugepage_info[hpi_idx];
+ hugepage_sz = hpi->hugepage_sz;
+
+ /* budget left on this socket after earlier page sizes */
+ max_segs = RTE_MAX_MEMSEG_PER_TYPE;
+ max_pagesz_mem = max_socket_mem - cur_socket_mem;
+
+ /* make it multiple of page size */
+ max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem,
+ hugepage_sz);
+
+ /* NOTE(review): socket_id is unsigned int here but is
+ * printed with %i below (the skip message above uses
+ * %u) - confirm and align the format specifier.
+ */
+ RTE_LOG(DEBUG, EAL, "Attempting to preallocate "
+ "%" PRIu64 "M on socket %i\n",
+ max_pagesz_mem >> 20, socket_id);
+
+ /* keep adding lists of this page size until either the
+ * byte budget or the per-type segment cap is reached
+ */
+ type_msl_idx = 0;
+ while (cur_pagesz_mem < max_pagesz_mem &&
+ total_segs < max_segs) {
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ return -1;
+ }
+
+ /* take the next unused slot in the shared config */
+ msl = &mcfg->memsegs[msl_idx++];
+
+ if (alloc_memseg_list(msl, hugepage_sz,
+ max_pagesz_mem, socket_id,
+ type_msl_idx))
+ return -1;
+
+ /* account for the segments this list can hold */
+ total_segs += msl->memseg_arr.len;
+ cur_pagesz_mem = total_segs * hugepage_sz;
+ type_msl_idx++;
+
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+ return -1;
+ }
+ }
+ cur_socket_mem += cur_pagesz_mem;
+ }
+ }
+
+ return 0;
+}
+
+static int __rte_unused
+memseg_primary_init(void)
{
- return rte_eal_get_configuration()->mem_config->memseg;
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i, socket_id, hpi_idx, msl_idx = 0;
+ struct rte_memseg_list *msl;
+ uint64_t max_mem, total_mem;
+
+ /* no-huge does not need this at all */
+ if (internal_config.no_hugetlbfs)
+ return 0;
+
+ max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
+ total_mem = 0;
+
+ /* create memseg lists */
+ for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
+ hpi_idx++) {
+ struct hugepage_info *hpi;
+ uint64_t hugepage_sz;
+
+ hpi = &internal_config.hugepage_info[hpi_idx];
+ hugepage_sz = hpi->hugepage_sz;
+
+ for (i = 0; i < (int) rte_socket_count(); i++) {
+ uint64_t max_type_mem, total_type_mem = 0;
+ int type_msl_idx, max_segs, total_segs = 0;
+
+ socket_id = rte_socket_id_by_idx(i);
+
+#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (socket_id > 0)
+ break;
+#endif
+
+ max_type_mem = RTE_MIN(max_mem - total_mem,
+ (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);
+ max_segs = RTE_MAX_MEMSEG_PER_TYPE;
+
+ type_msl_idx = 0;
+ while (total_type_mem < max_type_mem &&
+ total_segs < max_segs) {
+ uint64_t cur_max_mem;
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ return -1;
+ }
+
+ msl = &mcfg->memsegs[msl_idx++];
+
+ cur_max_mem = max_type_mem - total_type_mem;
+ if (alloc_memseg_list(msl, hugepage_sz,
+ cur_max_mem, socket_id,
+ type_msl_idx))
+ return -1;
+
+ total_segs += msl->memseg_arr.len;
+ total_type_mem = total_segs * hugepage_sz;
+ type_msl_idx++;
+
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+ return -1;
+ }
+ }
+ total_mem += total_type_mem;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Secondary-process memseg list setup: for every non-empty memseg list in
+ * the shared memory config, attach to its fbarray and reserve the VA space
+ * for it. Returns 0 on success, -1 as soon as any list fails.
+ */
+static int
+memseg_secondary_init(void)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct rte_memseg_list *list;
+	int idx;
+
+	for (idx = 0; idx < RTE_MAX_MEMSEG_LISTS; idx++) {
+		list = &mcfg->memsegs[idx];
+
+		/* a zero-length array marks a slot that was never set up */
+		if (list->memseg_arr.len == 0)
+			continue;
+
+		if (rte_fbarray_attach(&list->memseg_arr) != 0) {
+			RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
+			return -1;
+		}
+
+		/* claim the address range for this list's pages */
+		if (alloc_va_space(list) != 0) {
+			RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Look up the memseg that contains a virtual address within a given memseg
+ * list.
+ *
+ * @param addr virtual address to look up
+ * @param msl  memseg list expected to contain addr; may be NULL, e.g. when
+ *             the caller's own list lookup found nothing
+ * @return pointer to the memseg covering addr, or NULL if msl is NULL or
+ *         addr lies outside [base_va, base_va + page_sz * len)
+ */
+static struct rte_memseg *
+virt2memseg(const void *addr, const struct rte_memseg_list *msl)
+{
+	const struct rte_fbarray *arr;
+	void *start, *end;
+	int ms_idx;
+
+	/* no list to search: report "not found" instead of dereferencing NULL */
+	if (msl == NULL)
+		return NULL;
+
+	/* a memseg list was specified, check if it's the right one */
+	start = msl->base_va;
+	end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len);
+
+	if (addr < start || addr >= end)
+		return NULL;
+
+	/* segments are indexed by page offset from the list's base address */
+	arr = &msl->memseg_arr;
+	ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
+	return rte_fbarray_get(arr, ms_idx);
+}
+
+/*
+ * Find the memseg list whose address range
+ * [base_va, base_va + page_sz * len) contains the given virtual address.
+ * Returns the list, or NULL if no list covers the address.
+ */
+static struct rte_memseg_list *
+virt2memseg_list(const void *addr)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	int idx;
+
+	for (idx = 0; idx < RTE_MAX_MEMSEG_LISTS; idx++) {
+		struct rte_memseg_list *cur = &mcfg->memsegs[idx];
+		void *lo = cur->base_va;
+		void *hi = RTE_PTR_ADD(lo,
+				(size_t)cur->page_sz * cur->memseg_arr.len);
+
+		if (addr >= lo && addr < hi)
+			return cur;
+	}
+
+	/* walked every slot without a match */
+	return NULL;
+}
+
+__rte_experimental struct rte_memseg_list *
+rte_mem_virt2memseg_list(const void *addr)
+{
+ return virt2memseg_list(addr);
}
struct virtiova {
@@ -136,7 +518,8 @@ struct virtiova {
void *virt;
};
static int
-find_virt(const struct rte_memseg *ms, void *arg)
+find_virt(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
{
struct virtiova *vi = arg;
if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
@@ -147,6 +530,19 @@ find_virt(const struct rte_memseg *ms, void *arg)
}
return 0;
}
+/*
+ * Contiguous-walk callback: if the IOVA stored in the struct virtiova passed
+ * as arg falls inside [ms->iova, ms->iova + len), store the matching virtual
+ * address in it and return 1 to stop the walk; otherwise return 0 to continue.
+ */
+static int
+find_virt_legacy(const struct rte_memseg_list *msl __rte_unused,
+		const struct rte_memseg *ms, size_t len, void *arg)
+{
+	struct virtiova *lookup = arg;
+	size_t delta;
+
+	/* not in this area, keep walking */
+	if (lookup->iova < ms->iova || lookup->iova >= (ms->iova + len))
+		return 0;
+
+	delta = lookup->iova - ms->iova;
+	lookup->virt = RTE_PTR_ADD(ms->addr, delta);
+	/* stop the walk */
+	return 1;
+}
__rte_experimental void *
rte_mem_iova2virt(rte_iova_t iova)
@@ -156,54 +552,30 @@ rte_mem_iova2virt(rte_iova_t iova)
memset(&vi, 0, sizeof(vi));
vi.iova = iova;
- rte_memseg_walk(find_virt, &vi);
+ /* for legacy mem, we can get away with scanning VA-contiguous segments,
+ * as we know they are PA-contiguous as well
+ */
+ if (internal_config.legacy_mem)
+ rte_memseg_contig_walk(find_virt_legacy, &vi);
+ else
+ rte_memseg_walk(find_virt, &vi);
return vi.virt;
}
-struct virtms {
- const void *virt;
- struct rte_memseg *ms;
-};
-static int
-find_memseg(const struct rte_memseg *ms, void *arg)
-{
- struct virtms *vm = arg;
-
- if (arg >= ms->addr && arg < RTE_PTR_ADD(ms->addr, ms->len)) {
- struct rte_memseg *memseg, *found_ms;
- int idx;
-
- memseg = rte_eal_get_configuration()->mem_config->memseg;
- idx = ms - memseg;
- found_ms = &memseg[idx];
-
- vm->ms = found_ms;
- return 1;
- }
- return 0;
-}
-
__rte_experimental struct rte_memseg *
-rte_mem_virt2memseg(const void *addr)
+rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
- struct virtms vm;
-
- memset(&vm, 0, sizeof(vm));
-
- vm.virt = addr;
-
- rte_memseg_walk(find_memseg, &vm);
-
- return vm.ms;
+ return virt2memseg(addr, msl != NULL ? msl :
+ rte_mem_virt2memseg_list(addr));
}
static int
-physmem_size(const struct rte_memseg *ms, void *arg)
+physmem_size(const struct rte_memseg_list *msl, void *arg)
{
uint64_t *total_len = arg;
- *total_len += ms->len;
+ *total_len += msl->memseg_arr.count * msl->page_sz;
return 0;
}
@@ -214,32 +586,39 @@ rte_eal_get_physmem_size(void)
{
uint64_t total_len = 0;
- rte_memseg_walk(physmem_size, &total_len);
+ rte_memseg_list_walk(physmem_size, &total_len);
return total_len;
}
static int
-dump_memseg(const struct rte_memseg *ms, void *arg)
+dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ void *arg)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int i = ms - mcfg->memseg;
+ int msl_idx, ms_idx;
FILE *f = arg;
- if (i < 0 || i >= RTE_MAX_MEMSEG)
+ msl_idx = msl - mcfg->memsegs;
+ if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
return -1;
- fprintf(f, "Segment %u: IOVA:0x%"PRIx64", len:%zu, "
+ ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+ if (ms_idx < 0)
+ return -1;
+
+ fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
"virt:%p, socket_id:%"PRId32", "
"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
- "nrank:%"PRIx32"\n", i,
- mcfg->memseg[i].iova,
- mcfg->memseg[i].len,
- mcfg->memseg[i].addr,
- mcfg->memseg[i].socket_id,
- mcfg->memseg[i].hugepage_sz,
- mcfg->memseg[i].nchannel,
- mcfg->memseg[i].nrank);
+ "nrank:%"PRIx32"\n",
+ msl_idx, ms_idx,
+ ms->iova,
+ ms->len,
+ ms->addr,
+ ms->socket_id,
+ ms->hugepage_sz,
+ ms->nchannel,
+ ms->nrank);
return 0;
}
@@ -289,55 +668,89 @@ rte_mem_lock_page(const void *virt)
}
int __rte_experimental
-rte_memseg_walk(rte_memseg_walk_t func, void *arg)
+rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int i, ret;
+ int i, ms_idx, ret = 0;
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- const struct rte_memseg *ms = &mcfg->memseg[i];
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ const struct rte_memseg *ms;
+ struct rte_fbarray *arr;
- if (ms->addr == NULL)
+ if (msl->memseg_arr.count == 0)
continue;
- ret = func(ms, arg);
- if (ret < 0)
- return -1;
- if (ret > 0)
- return 1;
+ arr = &msl->memseg_arr;
+
+ ms_idx = rte_fbarray_find_next_used(arr, 0);
+ while (ms_idx >= 0) {
+ int n_segs;
+ size_t len;
+
+ ms = rte_fbarray_get(arr, ms_idx);
+
+ /* find how many more segments there are, starting with
+ * this one.
+ */
+ n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
+ len = n_segs * msl->page_sz;
+
+ ret = func(msl, ms, len, arg);
+ if (ret < 0)
+ return -1;
+ else if (ret > 0)
+ return 1;
+ ms_idx = rte_fbarray_find_next_used(arr,
+ ms_idx + n_segs);
+ }
}
return 0;
}
int __rte_experimental
-rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
+rte_memseg_walk(rte_memseg_walk_t func, void *arg)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int i, j, ret;
+ int i, ms_idx, ret = 0;
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- const struct rte_memseg *ms = &mcfg->memseg[i];
- size_t total_len;
- void *end_addr;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ const struct rte_memseg *ms;
+ struct rte_fbarray *arr;
- if (ms->addr == NULL)
+ if (msl->memseg_arr.count == 0)
continue;
- end_addr = RTE_PTR_ADD(ms->addr, ms->len);
+ arr = &msl->memseg_arr;
+
+ ms_idx = rte_fbarray_find_next_used(arr, 0);
+ while (ms_idx >= 0) {
+ ms = rte_fbarray_get(arr, ms_idx);
+ ret = func(msl, ms, arg);
+ if (ret < 0)
+ return -1;
+ else if (ret > 0)
+ return 1;
+ ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
+ }
+ }
+ return 0;
+}
- /* check how many more segments are contiguous to this one */
- for (j = i + 1; j < RTE_MAX_MEMSEG; j++) {
- const struct rte_memseg *next = &mcfg->memseg[j];
+int __rte_experimental
+rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i, ret = 0;
- if (next->addr != end_addr)
- break;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
- end_addr = RTE_PTR_ADD(next->addr, next->len);
- i++;
- }
- total_len = RTE_PTR_DIFF(end_addr, ms->addr);
+ if (msl->base_va == NULL)
+ continue;
- ret = func(ms, total_len, arg);
+ ret = func(msl, arg);
if (ret < 0)
return -1;
if (ret > 0)
@@ -350,9 +763,25 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
int
rte_eal_memory_init(void)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int retval;
RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");
- const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
+ if (!mcfg)
+ return -1;
+
+ retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
+#ifndef RTE_ARCH_64
+ memseg_primary_init_32() :
+#else
+ memseg_primary_init() :
+#endif
+ memseg_secondary_init();
+
+ if (retval < 0)
+ return -1;
+
+ retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
rte_eal_hugepage_init() :
rte_eal_hugepage_attach();
if (retval < 0)
diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index d60bde7..1f5f753 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -239,10 +239,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
mz->iova = rte_malloc_virt2iova(mz_addr);
mz->addr = mz_addr;
mz->len = (requested_len == 0 ? elem->size : requested_len);
- mz->hugepage_sz = elem->ms->hugepage_sz;
- mz->socket_id = elem->ms->socket_id;
+ mz->hugepage_sz = elem->msl->page_sz;
+ mz->socket_id = elem->msl->socket_id;
mz->flags = 0;
- mz->memseg_id = elem->ms - rte_eal_get_configuration()->mem_config->memseg;
return mz;
}
@@ -364,20 +363,50 @@ static void
dump_memzone(const struct rte_memzone *mz, void *arg)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = NULL;
+ void *cur_addr, *mz_end;
+ struct rte_memseg *ms;
+ int mz_idx, ms_idx;
+ size_t page_sz;
FILE *f = arg;
- int mz_idx;
mz_idx = mz - mcfg->memzone;
- fprintf(f, "Zone %u: name:<%s>, IO:0x%"PRIx64", len:0x%zx, virt:%p, "
+ fprintf(f, "Zone %u: name:<%s>, len:0x%zx, virt:%p, "
"socket_id:%"PRId32", flags:%"PRIx32"\n",
mz_idx,
mz->name,
- mz->iova,
mz->len,
mz->addr,
mz->socket_id,
mz->flags);
+
+ /* go through each page occupied by this memzone */
+ msl = rte_mem_virt2memseg_list(mz->addr);
+ if (!msl) {
+ RTE_LOG(DEBUG, EAL, "Skipping bad memzone\n");
+ return;
+ }
+ page_sz = (size_t)mz->hugepage_sz;
+ cur_addr = RTE_PTR_ALIGN_FLOOR(mz->addr, page_sz);
+ mz_end = RTE_PTR_ADD(cur_addr, mz->len);
+
+ fprintf(f, "physical segments used:\n");
+ ms_idx = RTE_PTR_DIFF(mz->addr, msl->base_va) / page_sz;
+ ms = rte_fbarray_get(&msl->memseg_arr, ms_idx);
+
+ do {
+ fprintf(f, " addr: %p iova: 0x%" PRIx64 " "
+ "len: 0x%zx "
+ "pagesz: 0x%zx\n",
+ cur_addr, ms->iova, ms->len, page_sz);
+
+ /* advance VA to next page */
+ cur_addr = RTE_PTR_ADD(cur_addr, page_sz);
+
+ /* memzones occupy contiguous segments */
+ ++ms;
+ } while (cur_addr < mz_end);
}
/* Dump all reserved memory zones on console */
@@ -394,7 +423,6 @@ int
rte_eal_memzone_init(void)
{
struct rte_mem_config *mcfg;
- const struct rte_memseg *memseg;
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
@@ -403,12 +431,6 @@ rte_eal_memzone_init(void)
if (rte_eal_process_type() == RTE_PROC_SECONDARY)
return 0;
- memseg = rte_eal_get_physmem_layout();
- if (memseg == NULL) {
- RTE_LOG(ERR, EAL, "%s(): Cannot get physical layout\n", __func__);
- return -1;
- }
-
rte_rwlock_write_lock(&mcfg->mlock);
/* delete all zones */
diff --git a/lib/librte_eal/common/eal_hugepages.h b/lib/librte_eal/common/eal_hugepages.h
index 1d519bb..ad1b0b6 100644
--- a/lib/librte_eal/common/eal_hugepages.h
+++ b/lib/librte_eal/common/eal_hugepages.h
@@ -22,7 +22,6 @@ struct hugepage_file {
size_t size; /**< the page size */
int socket_id; /**< NUMA socket ID */
int file_id; /**< the '%d' in HUGEFILE_FMT */
- int memseg_id; /**< the memory segment to which page belongs */
char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */
};
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index fda087b..5cf7102 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -23,7 +23,7 @@ struct hugepage_info {
uint64_t hugepage_sz; /**< size of a huge page */
const char *hugedir; /**< dir where hugetlbfs is mounted */
uint32_t num_pages[RTE_MAX_NUMA_NODES];
- /**< number of hugepages of that size on each socket */
+ /**< number of hugepages of that size on each socket */
int lock_descriptor; /**< file descriptor for hugepage dir */
};
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index 29fa0b6..b745e18 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -12,12 +12,30 @@
#include <rte_malloc_heap.h>
#include <rte_rwlock.h>
#include <rte_pause.h>
+#include <rte_fbarray.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
+ * memseg list is a special case as we need to store a bunch of other data
+ * together with the array itself.
+ */
+struct rte_memseg_list {
+ RTE_STD_C11
+ union {
+ void *base_va;
+ /**< Base virtual address for this memseg list. */
+ uint64_t addr_64;
+ /**< Makes sure addr is always 64-bits */
+ };
+ int socket_id; /**< Socket ID for all memsegs in this list. */
+ uint64_t page_sz; /**< Page size for all memsegs in this list. */
+ struct rte_fbarray memseg_arr;
+ /**< Array of struct rte_memseg entries for this list; entry N
+ * describes the page at base_va + N * page_sz.
+ */
+};
+
+/**
* the structure for the memory configuration for the RTE.
* Used by the rte_config structure. It is separated out, as for multi-process
* support, the memory details should be shared across instances
@@ -43,9 +61,11 @@ struct rte_mem_config {
uint32_t memzone_cnt; /**< Number of allocated memzones */
/* memory segments and zones */
- struct rte_memseg memseg[RTE_MAX_MEMSEG]; /**< Physmem descriptors. */
struct rte_memzone memzone[RTE_MAX_MEMZONE]; /**< Memzone descriptors. */
+ struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
+ /**< list of dynamic arrays holding memsegs */
+
struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */
/* Heaps of Malloc per socket */
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index b3d7e61..f31227b 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -23,6 +23,9 @@ extern "C" {
#include <rte_compat.h>
#include <rte_config.h>
+/* forward declaration for pointers */
+struct rte_memseg_list;
+
__extension__
enum rte_page_sizes {
RTE_PGSIZE_4K = 1ULL << 12,
@@ -147,11 +150,25 @@ rte_mem_iova2virt(rte_iova_t iova);
*
* @param virt
* The virtual address.
+ * @param msl
+ * The memseg list in which to look up based on ``virt`` address
+ * (can be NULL).
* @return
* Memseg pointer on success, or NULL on error.
*/
__rte_experimental struct rte_memseg *
-rte_mem_virt2memseg(const void *virt);
+rte_mem_virt2memseg(const void *virt, const struct rte_memseg_list *msl);
+
+/**
+ * Get memseg list corresponding to virtual memory address.
+ *
+ * @param virt
+ * The virtual address.
+ * @return
+ * Memseg list to which this virtual address belongs to.
+ */
+__rte_experimental struct rte_memseg_list *
+rte_mem_virt2memseg_list(const void *virt);
/**
* Memseg walk function prototype.
@@ -160,7 +177,8 @@ rte_mem_virt2memseg(const void *virt);
* Returning 1 will stop the walk
* Returning -1 will stop the walk and report error
*/
-typedef int (*rte_memseg_walk_t)(const struct rte_memseg *ms, void *arg);
+typedef int (*rte_memseg_walk_t)(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, void *arg);
/**
* Memseg contig walk function prototype. This will trigger a callback on every
@@ -171,8 +189,19 @@ typedef int (*rte_memseg_walk_t)(const struct rte_memseg *ms, void *arg);
* Returning 1 will stop the walk
* Returning -1 will stop the walk and report error
*/
-typedef int (*rte_memseg_contig_walk_t)(const struct rte_memseg *ms,
- size_t len, void *arg);
+typedef int (*rte_memseg_contig_walk_t)(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, size_t len, void *arg);
+
+/**
+ * Memseg list walk function prototype. This will trigger a callback on every
+ * allocated memseg list.
+ *
+ * Returning 0 will continue walk
+ * Returning 1 will stop the walk
+ * Returning -1 will stop the walk and report error
+ */
+typedef int (*rte_memseg_list_walk_t)(const struct rte_memseg_list *msl,
+ void *arg);
/**
* Walk list of all memsegs.
@@ -205,21 +234,19 @@ int __rte_experimental
rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg);
/**
- * Get the layout of the available physical memory.
- *
- * It can be useful for an application to have the full physical
- * memory layout to decide the size of a memory zone to reserve. This
- * table is stored in rte_config (see rte_eal_get_configuration()).
+ * Walk each allocated memseg list.
*
+ * @param func
+ * Iterator function
+ * @param arg
+ * Argument passed to iterator
* @return
- * - On success, return a pointer to a read-only table of struct
- * rte_physmem_desc elements, containing the layout of all
- * addressable physical memory. The last element of the table
- * contains a NULL address.
- * - On error, return NULL. This should not happen since it is a fatal
- * error that will probably cause the entire system to panic.
+ * 0 if walked over the entire list
+ * 1 if stopped by the user
+ * -1 if user function reported error
*/
-const struct rte_memseg *rte_eal_get_physmem_layout(void);
+int __rte_experimental
+rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg);
/**
* Dump the physical memory layout to a file.
diff --git a/lib/librte_eal/common/include/rte_memzone.h b/lib/librte_eal/common/include/rte_memzone.h
index e2630fd..0eeb94f 100644
--- a/lib/librte_eal/common/include/rte_memzone.h
+++ b/lib/librte_eal/common/include/rte_memzone.h
@@ -68,7 +68,6 @@ struct rte_memzone {
int32_t socket_id; /**< NUMA socket ID. */
uint32_t flags; /**< Characteristics of this memzone. */
- uint32_t memseg_id; /**< Memseg it belongs. */
} __attribute__((__packed__));
/**
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index 87695b9..685aac4 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -27,11 +27,11 @@
* Initialize a general malloc_elem header structure
*/
void
-malloc_elem_init(struct malloc_elem *elem,
- struct malloc_heap *heap, const struct rte_memseg *ms, size_t size)
+malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap,
+ struct rte_memseg_list *msl, size_t size)
{
elem->heap = heap;
- elem->ms = ms;
+ elem->msl = msl;
elem->prev = NULL;
elem->next = NULL;
memset(&elem->free_list, 0, sizeof(elem->free_list));
@@ -100,7 +100,7 @@ malloc_elem_insert(struct malloc_elem *elem)
* so we just check the page addresses.
*/
static bool
-elem_check_phys_contig(const struct rte_memseg *ms __rte_unused,
+elem_check_phys_contig(const struct rte_memseg_list *msl __rte_unused,
void *start, size_t size)
{
rte_iova_t cur, expected;
@@ -191,7 +191,7 @@ elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align,
* couldn't fit all data into one physically contiguous
* block, try again with lower addresses.
*/
- if (!elem_check_phys_contig(elem->ms,
+ if (!elem_check_phys_contig(elem->msl,
(void *)new_data_start,
new_data_size)) {
elem_size -= align;
@@ -225,7 +225,7 @@ split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt)
const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem;
const size_t new_elem_size = elem->size - old_elem_size;
- malloc_elem_init(split_pt, elem->heap, elem->ms, new_elem_size);
+ malloc_elem_init(split_pt, elem->heap, elem->msl, new_elem_size);
split_pt->prev = elem;
split_pt->next = next_elem;
if (next_elem)
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 34bd268..620dd44 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -7,7 +7,7 @@
#include <stdbool.h>
-#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
/* dummy definition of struct so we can use pointers to it in malloc_elem struct */
struct malloc_heap;
@@ -26,7 +26,7 @@ struct malloc_elem {
/**< points to next elem in memseg */
LIST_ENTRY(malloc_elem) free_list;
/**< list of free elements in heap */
- const struct rte_memseg *ms;
+ struct rte_memseg_list *msl;
volatile enum elem_state state;
uint32_t pad;
size_t size;
@@ -113,7 +113,7 @@ malloc_elem_from_data(const void *data)
void
malloc_elem_init(struct malloc_elem *elem,
struct malloc_heap *heap,
- const struct rte_memseg *ms,
+ struct rte_memseg_list *msl,
size_t size);
void
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 79914fc..0ef2c45 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -21,6 +21,7 @@
#include <rte_memcpy.h>
#include <rte_atomic.h>
+#include "eal_internal_cfg.h"
#include "malloc_elem.h"
#include "malloc_heap.h"
@@ -62,36 +63,49 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
}
/*
- * Expand the heap with a memseg.
- * This reserves the zone and sets a dummy malloc_elem header at the end
- * to prevent overflow. The rest of the zone is added to free list as a single
- * large free block
+ * Expand the heap with a memory area.
*/
+/*
+ * Turn the memory area [start, start + len) into a malloc element belonging
+ * to the given heap and memseg list, insert it, join it with adjacent free
+ * elements, put the result on the heap's free list and grow the heap's total
+ * size by len. Returns the (possibly joined) element.
+ */
+static struct malloc_elem *
+malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
+		void *start, size_t len)
+{
+	struct malloc_elem *fresh = start;
+	struct malloc_elem *joined;
+
+	malloc_elem_init(fresh, heap, msl, len);
+	malloc_elem_insert(fresh);
+
+	/* the element returned by the join is the one that goes on the list */
+	joined = malloc_elem_join_adjacent_free(fresh);
+	malloc_elem_free_list_insert(joined);
+
+	heap->total_size += len;
+
+	return joined;
+}
+
static int
-malloc_heap_add_memseg(const struct rte_memseg *ms, void *arg __rte_unused)
+malloc_add_seg(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, size_t len, void *arg __rte_unused)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct malloc_elem *start_elem;
- struct rte_memseg *found_ms;
+ struct rte_memseg_list *found_msl;
struct malloc_heap *heap;
- size_t elem_size;
- int ms_idx;
-
- heap = &mcfg->malloc_heaps[ms->socket_id];
+ int msl_idx;
- /* ms is const, so find it */
- ms_idx = ms - mcfg->memseg;
- found_ms = &mcfg->memseg[ms_idx];
+ heap = &mcfg->malloc_heaps[msl->socket_id];
- start_elem = (struct malloc_elem *)found_ms->addr;
- elem_size = ms->len - MALLOC_ELEM_OVERHEAD;
+ /* msl is const, so find it */
+ msl_idx = msl - mcfg->memsegs;
+ found_msl = &mcfg->memsegs[msl_idx];
- malloc_elem_init(start_elem, heap, found_ms, elem_size);
- malloc_elem_insert(start_elem);
- malloc_elem_free_list_insert(start_elem);
+ if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
+ return -1;
- heap->total_size += elem_size;
+ malloc_heap_add_memory(heap, found_msl, ms->addr, len);
+ RTE_LOG(DEBUG, EAL, "Added %zuM to heap on socket %i\n", len >> 20,
+ msl->socket_id);
return 0;
}
@@ -114,7 +128,8 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
!!elem; elem = LIST_NEXT(elem, free_list)) {
if (malloc_elem_can_hold(elem, size, align, bound,
contig)) {
- if (check_hugepage_sz(flags, elem->ms->hugepage_sz))
+ if (check_hugepage_sz(flags,
+ elem->msl->page_sz))
return elem;
if (alt_elem == NULL)
alt_elem = elem;
@@ -263,7 +278,6 @@ rte_eal_malloc_heap_init(void)
if (mcfg == NULL)
return -1;
- rte_memseg_walk(malloc_heap_add_memseg, NULL);
-
- return 0;
+ /* add all IOVA-contiguous areas to the heap */
+ return rte_memseg_contig_walk(malloc_add_seg, NULL);
}
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index 436818a..c6d3e57 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -242,17 +242,21 @@ rte_malloc_set_limit(__rte_unused const char *type,
+/*
+ * Translate an address inside a malloc element into an IOVA address.
+ *
+ * Returns RTE_BAD_IOVA when malloc_elem_from_data() finds no element for
+ * addr, when no memseg is found for addr in the element's memseg list, or
+ * when that memseg's own IOVA is RTE_BAD_IOVA. In RTE_IOVA_VA mode the
+ * virtual address itself is returned.
+ */
rte_iova_t
rte_malloc_virt2iova(const void *addr)
{
- rte_iova_t iova;
- const struct malloc_elem *elem = malloc_elem_from_data(addr);
+ const struct rte_memseg *ms;
+ struct malloc_elem *elem = malloc_elem_from_data(addr);
+
if (elem == NULL)
return RTE_BAD_IOVA;
- if (elem->ms->iova == RTE_BAD_IOVA)
- return RTE_BAD_IOVA;
if (rte_eal_iova_mode() == RTE_IOVA_VA)
- iova = (uintptr_t)addr;
- else
- iova = elem->ms->iova +
- RTE_PTR_DIFF(addr, elem->ms->addr);
- return iova;
+ return (uintptr_t) addr;
+
+ /* look up the memseg that covers addr within the element's list */
+ ms = rte_mem_virt2memseg(addr, elem->msl);
+ if (ms == NULL)
+ return RTE_BAD_IOVA;
+
+ if (ms->iova == RTE_BAD_IOVA)
+ return RTE_BAD_IOVA;
+
+ /* memseg IOVA plus the offset of addr from the memseg's start */
+ return ms->iova + RTE_PTR_DIFF(addr, ms->addr);
}
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index b34e57a..ffcbd71 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -74,8 +74,8 @@ static int mem_cfg_fd = -1;
+/*
+ * Write lock description whose byte range starts at the offset of the
+ * memsegs member of struct rte_mem_config and is as long as
+ * early_mem_config.memsegs.
+ */
static struct flock wr_lock = {
.l_type = F_WRLCK,
.l_whence = SEEK_SET,
- .l_start = offsetof(struct rte_mem_config, memseg),
- .l_len = sizeof(early_mem_config.memseg),
+ .l_start = offsetof(struct rte_mem_config, memsegs),
+ .l_len = sizeof(early_mem_config.memsegs),
};
/* Address of global and public configuration */
@@ -640,11 +640,14 @@ out:
}
+/*
+ * Memseg list walk callback; arg points to the socket id to look for.
+ * Returns 1 when the list is on that socket and its memseg array has a
+ * non-zero count, 0 otherwise.
+ */
static int
-check_mem(const struct rte_memseg *ms, void *arg)
+check_socket(const struct rte_memseg_list *msl, void *arg)
{
- int *socket = arg;
+ int *socket_id = arg;
- return ms->socket_id == *socket;
+ if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)
+ return 1;
+
+ return 0;
}
static void
@@ -654,7 +657,7 @@ eal_check_mem_on_local_socket(void)
socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
- if (rte_memseg_walk(check_mem, &socket_id) == 0)
+ if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 8bbf771..afebd42 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -15,6 +15,7 @@
#include <unistd.h>
#include <errno.h>
#include <sys/queue.h>
+#include <sys/stat.h>
#include <rte_memory.h>
#include <rte_eal.h>
@@ -160,6 +161,18 @@ get_hugepage_dir(uint64_t hugepage_sz)
}
/*
+ * Report the size of the file behind fd as given by fstat().
+ * A failing fstat() is reported as a size of 0.
+ */
+static off_t
+get_file_size(int fd)
+{
+ struct stat info;
+
+ return fstat(fd, &info) < 0 ? 0 : info.st_size;
+}
+
+/*
* Clear the hugepage directory of whatever hugepage files
* there are. Checks if the file is locked (i.e.
* if it's in use by another DPDK process).
@@ -189,6 +202,8 @@ clear_hugedir(const char * hugedir)
}
while(dirent != NULL){
+ struct flock lck = {0};
+
/* skip files that don't match the hugepage pattern */
if (fnmatch(filter, dirent->d_name, 0) > 0) {
dirent = readdir(dir);
@@ -205,11 +220,17 @@ clear_hugedir(const char * hugedir)
}
/* non-blocking lock */
- lck_result = flock(fd, LOCK_EX | LOCK_NB);
+ lck.l_type = F_RDLCK;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = 0;
+ lck.l_len = get_file_size(fd);
+
+ lck_result = fcntl(fd, F_SETLK, &lck);
/* if lock succeeds, unlock and remove the file */
if (lck_result != -1) {
- flock(fd, LOCK_UN);
+ lck.l_type = F_UNLCK;
+ fcntl(fd, F_SETLK, &lck);
unlinkat(dir_fd, dirent->d_name, 0);
}
close (fd);
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 17c559f..daab364 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -253,13 +253,12 @@ void numa_error(char *where)
*/
static unsigned
map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
- uint64_t *essential_memory __rte_unused, int orig)
+ uint64_t *essential_memory __rte_unused)
{
int fd;
unsigned i;
void *virtaddr;
- void *vma_addr = NULL;
- size_t vma_len = 0;
+ struct flock lck = {0};
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
int node_id = -1;
int essential_prev = 0;
@@ -274,7 +273,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
have_numa = false;
}
- if (orig && have_numa) {
+ if (have_numa) {
RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
if (get_mempolicy(&oldpolicy, oldmask->maskp,
oldmask->size + 1, 0, 0) < 0) {
@@ -290,6 +289,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
#endif
for (i = 0; i < hpi->num_pages[0]; i++) {
+ struct hugepage_file *hf = &hugepg_tbl[i];
uint64_t hugepage_sz = hpi->hugepage_sz;
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
@@ -324,66 +324,14 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
}
#endif
- if (orig) {
- hugepg_tbl[i].file_id = i;
- hugepg_tbl[i].size = hugepage_sz;
- eal_get_hugefile_path(hugepg_tbl[i].filepath,
- sizeof(hugepg_tbl[i].filepath), hpi->hugedir,
- hugepg_tbl[i].file_id);
- hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0';
- }
-#ifndef RTE_ARCH_64
- /* for 32-bit systems, don't remap 1G and 16G pages, just reuse
- * original map address as final map address.
- */
- else if ((hugepage_sz == RTE_PGSIZE_1G)
- || (hugepage_sz == RTE_PGSIZE_16G)) {
- hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va;
- hugepg_tbl[i].orig_va = NULL;
- continue;
- }
-#endif
- else if (vma_len == 0) {
- unsigned j, num_pages;
-
- /* reserve a virtual area for next contiguous
- * physical block: count the number of
- * contiguous physical pages. */
- for (j = i+1; j < hpi->num_pages[0] ; j++) {
-#ifdef RTE_ARCH_PPC_64
- /* The physical addresses are sorted in
- * descending order on PPC64 */
- if (hugepg_tbl[j].physaddr !=
- hugepg_tbl[j-1].physaddr - hugepage_sz)
- break;
-#else
- if (hugepg_tbl[j].physaddr !=
- hugepg_tbl[j-1].physaddr + hugepage_sz)
- break;
-#endif
- }
- num_pages = j - i;
- vma_len = num_pages * hugepage_sz;
-
- /* get the biggest virtual memory area up to
- * vma_len. If it fails, vma_addr is NULL, so
- * let the kernel provide the address. */
- vma_addr = eal_get_virtual_area(NULL, &vma_len,
- hpi->hugepage_sz,
- EAL_VIRTUAL_AREA_ALLOW_SHRINK |
- EAL_VIRTUAL_AREA_UNMAP,
-#ifdef RTE_ARCH_PPC_64
- MAP_HUGETLB
-#else
- 0
-#endif
- );
- if (vma_addr == NULL)
- vma_len = hugepage_sz;
- }
+ hf->file_id = i;
+ hf->size = hugepage_sz;
+ eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath),
+ hpi->hugedir, hf->file_id);
+ hf->filepath[sizeof(hf->filepath) - 1] = '\0';
/* try to create hugepage file */
- fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0600);
+ fd = open(hf->filepath, O_CREAT | O_RDWR, 0600);
if (fd < 0) {
RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
strerror(errno));
@@ -391,8 +339,11 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
}
/* map the segment, and populate page tables,
- * the kernel fills this segment with zeros */
- virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
+ * the kernel fills this segment with zeros. we don't care where
+ * this gets mapped - we already have contiguous memory areas
+ * ready for us to map into.
+ */
+ virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, fd, 0);
if (virtaddr == MAP_FAILED) {
RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
@@ -401,44 +352,38 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
goto out;
}
- if (orig) {
- hugepg_tbl[i].orig_va = virtaddr;
- }
- else {
- /* rewrite physical addresses in IOVA as VA mode */
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- hugepg_tbl[i].physaddr = (uintptr_t)virtaddr;
- hugepg_tbl[i].final_va = virtaddr;
- }
+ hf->orig_va = virtaddr;
- if (orig) {
- /* In linux, hugetlb limitations, like cgroup, are
- * enforced at fault time instead of mmap(), even
- * with the option of MAP_POPULATE. Kernel will send
- * a SIGBUS signal. To avoid to be killed, save stack
- * environment here, if SIGBUS happens, we can jump
- * back here.
- */
- if (huge_wrap_sigsetjmp()) {
- RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
- "hugepages of size %u MB\n",
- (unsigned)(hugepage_sz / 0x100000));
- munmap(virtaddr, hugepage_sz);
- close(fd);
- unlink(hugepg_tbl[i].filepath);
+ /* In linux, hugetlb limitations, like cgroup, are
+ * enforced at fault time instead of mmap(), even
+ * with the option of MAP_POPULATE. Kernel will send
+ * a SIGBUS signal. To avoid to be killed, save stack
+ * environment here, if SIGBUS happens, we can jump
+ * back here.
+ */
+ if (huge_wrap_sigsetjmp()) {
+ RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+ "hugepages of size %u MB\n",
+ (unsigned int)(hugepage_sz / 0x100000));
+ munmap(virtaddr, hugepage_sz);
+ close(fd);
+ unlink(hugepg_tbl[i].filepath);
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- if (maxnode)
- essential_memory[node_id] =
- essential_prev;
+ if (maxnode)
+ essential_memory[node_id] =
+ essential_prev;
#endif
- goto out;
- }
- *(int *)virtaddr = 0;
+ goto out;
}
+ *(int *)virtaddr = 0;
- /* set shared flock on the file. */
- if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
+ /* set shared lock on the file. */
+ lck.l_type = F_RDLCK;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = 0;
+ lck.l_len = hugepage_sz;
+ if (fcntl(fd, F_SETLK, &lck) == -1) {
RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
__func__, strerror(errno));
close(fd);
@@ -446,9 +391,6 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
}
close(fd);
-
- vma_addr = (char *)vma_addr + hugepage_sz;
- vma_len -= hugepage_sz;
}
out:
@@ -470,20 +412,6 @@ out:
return i;
}
-/* Unmap all hugepages from original mapping */
-static int
-unmap_all_hugepages_orig(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
-{
- unsigned i;
- for (i = 0; i < hpi->num_pages[0]; i++) {
- if (hugepg_tbl[i].orig_va) {
- munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz);
- hugepg_tbl[i].orig_va = NULL;
- }
- }
- return 0;
-}
-
/*
* Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
* page.
@@ -623,7 +551,7 @@ copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size,
int src_pos, dst_pos = 0;
for (src_pos = 0; src_pos < src_size; src_pos++) {
- if (src[src_pos].final_va != NULL) {
+ if (src[src_pos].orig_va != NULL) {
/* error on overflow attempt */
if (dst_pos == dest_size)
return -1;
@@ -694,9 +622,10 @@ unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
unmap_len = hp->size;
/* get start addr and len of the remaining segment */
- munmap(hp->final_va, (size_t) unmap_len);
+ munmap(hp->orig_va,
+ (size_t)unmap_len);
- hp->final_va = NULL;
+ hp->orig_va = NULL;
if (unlink(hp->filepath) == -1) {
RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n",
__func__, hp->filepath, strerror(errno));
@@ -715,6 +644,413 @@ unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
return 0;
}
+/*
+ * Remap the hugepage files hugepages[seg_start] .. hugepages[seg_end - 1]
+ * into the VA space of a memseg list and describe every page with its own
+ * rte_memseg.
+ *
+ * The list is the first one in mcfg->memsegs whose page size and socket id
+ * match those of hugepages[seg_start] and that has enough free entries. If
+ * the list already holds pages, one entry is left free in front of the new
+ * pages.
+ *
+ * Returns 0 on success, -1 if no list has room or if a file cannot be
+ * opened, locked or mapped. Pages remapped before a failure are left as
+ * they are.
+ */
+static int
+remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int cur_page, seg_len;
+ unsigned int msl_idx;
+ int ms_idx;
+ uint64_t page_sz;
+ size_t memseg_len;
+ int socket_id;
+
+ page_sz = hugepages[seg_start].size;
+ socket_id = hugepages[seg_start].socket_id;
+ seg_len = seg_end - seg_start;
+
+ RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n",
+ (seg_len * page_sz) >> 20ULL, socket_id);
+
+ /* find free space in memseg lists */
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ bool empty;
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+
+ if (msl->page_sz != page_sz)
+ continue;
+ if (msl->socket_id != socket_id)
+ continue;
+
+ /* leave space for a hole if array is not empty */
+ empty = arr->count == 0;
+ ms_idx = rte_fbarray_find_next_n_free(arr, 0,
+ seg_len + (empty ? 0 : 1));
+
+ /* memseg list is full? */
+ if (ms_idx < 0)
+ continue;
+
+ /* leave some space between memsegs, they are not IOVA
+ * contiguous, so they shouldn't be VA contiguous either.
+ */
+ if (!empty)
+ ms_idx++;
+ break;
+ }
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),
+ RTE_STR(CONFIG_RTE_MAX_MEM_MB_PER_TYPE));
+ return -1;
+ }
+
+ /* same macro spelling as the other PPC64 conditionals in this file,
+ * otherwise the backwards walk is never compiled in
+ */
+#ifdef RTE_ARCH_PPC_64
+ /* for PPC64 we go through the list backwards */
+ for (cur_page = seg_end - 1; cur_page >= seg_start;
+ cur_page--, ms_idx++) {
+#else
+ for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) {
+#endif
+ struct hugepage_file *hfile = &hugepages[cur_page];
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+ struct flock lck = {0};
+ void *addr;
+ int fd;
+
+ fd = open(hfile->filepath, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Could not open '%s': %s\n",
+ hfile->filepath, strerror(errno));
+ return -1;
+ }
+ /* set shared lock on the file. */
+ lck.l_type = F_RDLCK;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = 0;
+ lck.l_len = page_sz;
+ if (fcntl(fd, F_SETLK, &lck) == -1) {
+ RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n",
+ hfile->filepath, strerror(errno));
+ close(fd);
+ return -1;
+ }
+ /* each memseg is one page; its slot index fixes its address */
+ memseg_len = (size_t)page_sz;
+ addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);
+
+ /* we know this address is already mmapped by memseg list, so
+ * using MAP_FIXED here is safe
+ */
+ addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
+ if (addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n",
+ hfile->filepath, strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ /* we have a new address, so unmap previous one */
+#ifndef RTE_ARCH_64
+ /* in 32-bit legacy mode, we have already unmapped the page */
+ if (!internal_config.legacy_mem)
+ munmap(hfile->orig_va, page_sz);
+#else
+ munmap(hfile->orig_va, page_sz);
+#endif
+
+ hfile->orig_va = NULL;
+ hfile->final_va = addr;
+
+ /* rewrite physical addresses in IOVA as VA mode */
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ hfile->physaddr = (uintptr_t)addr;
+
+ /* set up memseg data */
+ ms->addr = addr;
+ ms->hugepage_sz = page_sz;
+ ms->len = memseg_len;
+ ms->iova = hfile->physaddr;
+ ms->socket_id = hfile->socket_id;
+ ms->nchannel = rte_memory_get_nchannel();
+ ms->nrank = rte_memory_get_nrank();
+
+ rte_fbarray_set_used(arr, ms_idx);
+
+ close(fd);
+ }
+ RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n",
+ (seg_len * page_sz) >> 20, socket_id);
+ return 0;
+}
+
+#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
+/*
+ * Create the fbarray backing one memseg list and record the list's page
+ * size and socket id. The array is named after the page size in kB, the
+ * socket id and type_msl_idx. base_va is reset to NULL.
+ * Returns 0 on success, -1 if the fbarray cannot be initialised.
+ */
+static int
+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
+ int n_segs, int socket_id, int type_msl_idx)
+{
+ char arr_name[RTE_FBARRAY_NAME_LEN];
+ int rc;
+
+ snprintf(arr_name, sizeof(arr_name), MEMSEG_LIST_FMT, page_sz >> 10,
+ socket_id, type_msl_idx);
+
+ rc = rte_fbarray_init(&msl->memseg_arr, arr_name, n_segs,
+ sizeof(struct rte_memseg));
+ if (rc != 0) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
+ rte_strerror(rte_errno));
+ return -1;
+ }
+
+ msl->base_va = NULL;
+ msl->socket_id = socket_id;
+ msl->page_sz = page_sz;
+
+ RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
+ (size_t)page_sz >> 10, socket_id);
+
+ return 0;
+}
+
+/*
+ * Obtain a virtual area big enough for every entry of the list's memseg
+ * array (page size times array length), asking for the list's current
+ * base_va, and store the resulting address back into base_va.
+ * Returns 0 on success, -1 if no area could be obtained.
+ */
+static int
+alloc_va_space(struct rte_memseg_list *msl)
+{
+ const uint64_t pg_sz = msl->page_sz;
+ size_t va_len = pg_sz * msl->memseg_arr.len;
+ int map_flags = 0;
+ void *va;
+
+#ifdef RTE_ARCH_PPC_64
+ map_flags |= MAP_HUGETLB;
+#endif
+
+ va = eal_get_virtual_area(msl->base_va, &va_len, pg_sz, 0, map_flags);
+ if (va != NULL) {
+ msl->base_va = va;
+ return 0;
+ }
+
+ if (rte_errno == EADDRNOTAVAIL)
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
+ (unsigned long long)va_len, msl->base_va);
+ else
+ RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+
+ return -1;
+}
+
+/*
+ * Our VA space is not preallocated yet, so preallocate it here. We need to know
+ * how many segments there are in order to map all pages into one address space,
+ * and leave appropriate holes between segments so that rte_malloc does not
+ * concatenate them into one big segment.
+ *
+ * we also need to unmap original pages to free up address space.
+ *
+ * For every hugepage size / socket pair the page table is scanned once to
+ * count the physically contiguous segments and the pages they contain; a
+ * memseg list sized for those pages plus one separator between segments is
+ * then allocated together with its VA space.
+ *
+ * Returns 0 on success, -1 if no free memseg list is left or if allocating
+ * the list or its VA space fails.
+ */
+static int __rte_unused
+prealloc_segments(struct hugepage_file *hugepages, int n_pages)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int cur_page, seg_start_page, end_seg, new_memseg;
+ unsigned int hpi_idx, socket, i;
+ int n_contig_segs, n_segs;
+ int msl_idx;
+
+ /* before we preallocate segments, we need to free up our VA space.
+ * we're not removing files, and we already have information about
+ * PA-contiguousness, so it is safe to unmap everything.
+ */
+ for (cur_page = 0; cur_page < n_pages; cur_page++) {
+ struct hugepage_file *hpi = &hugepages[cur_page];
+ munmap(hpi->orig_va, hpi->size);
+ hpi->orig_va = NULL;
+ }
+
+ /* we cannot know how many page sizes and sockets we have discovered, so
+ * loop over all of them
+ */
+ for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes;
+ hpi_idx++) {
+ uint64_t page_sz =
+ internal_config.hugepage_info[hpi_idx].hugepage_sz;
+
+ for (i = 0; i < rte_socket_count(); i++) {
+ struct rte_memseg_list *msl;
+
+ socket = rte_socket_id_by_idx(i);
+ n_contig_segs = 0;
+ n_segs = 0;
+ /* -1 means "not currently inside a segment" */
+ seg_start_page = -1;
+
+ for (cur_page = 0; cur_page < n_pages; cur_page++) {
+ struct hugepage_file *prev, *cur;
+ /* start of the segment that a new segment just
+ * closed, or -1 if no segment was closed that way
+ */
+ int prev_seg_start_page = -1;
+
+ cur = &hugepages[cur_page];
+ prev = cur_page == 0 ? NULL :
+ &hugepages[cur_page - 1];
+
+ new_memseg = 0;
+ end_seg = 0;
+
+ /* pages that are empty or belong to another socket
+ * or page size end any segment in progress
+ */
+ if (cur->size == 0)
+ end_seg = 1;
+ else if (cur->socket_id != (int) socket)
+ end_seg = 1;
+ else if (cur->size != page_sz)
+ end_seg = 1;
+ else if (cur_page == 0)
+ new_memseg = 1;
+#ifdef RTE_ARCH_PPC_64
+ /* On PPC64 architecture, the mmap always start
+ * from higher address to lower address. Here,
+ * physical addresses are in descending order.
+ */
+ else if ((prev->physaddr - cur->physaddr) !=
+ cur->size)
+ new_memseg = 1;
+#else
+ else if ((cur->physaddr - prev->physaddr) !=
+ cur->size)
+ new_memseg = 1;
+#endif
+ if (new_memseg) {
+ /* if we're already inside a segment,
+ * new segment means end of current one
+ */
+ if (seg_start_page != -1) {
+ end_seg = 1;
+ prev_seg_start_page =
+ seg_start_page;
+ }
+ seg_start_page = cur_page;
+ }
+
+ if (end_seg) {
+ if (prev_seg_start_page != -1) {
+ /* we've found a new segment */
+ n_contig_segs++;
+ n_segs += cur_page -
+ prev_seg_start_page;
+ } else if (seg_start_page != -1) {
+ /* we didn't find new segment,
+ * but did end current one
+ */
+ n_contig_segs++;
+ n_segs += cur_page -
+ seg_start_page;
+ seg_start_page = -1;
+ continue;
+ } else {
+ /* we're skipping this page */
+ continue;
+ }
+ }
+ /* segment continues */
+ }
+ /* check if we missed last segment */
+ if (seg_start_page != -1) {
+ n_contig_segs++;
+ n_segs += cur_page - seg_start_page;
+ }
+
+ /* if no segments were found, do not preallocate */
+ if (n_segs == 0)
+ continue;
+
+ /* we now have total number of pages that we will
+ * allocate for this segment list. add separator pages
+ * to the total count, and preallocate VA space.
+ */
+ n_segs += n_contig_segs - 1;
+
+ /* now, preallocate VA space for these segments */
+
+ /* first, find suitable memseg list for this */
+ /* (the first list whose base_va is still NULL is taken) */
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
+ msl_idx++) {
+ msl = &mcfg->memsegs[msl_idx];
+
+ if (msl->base_va != NULL)
+ continue;
+ break;
+ }
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ return -1;
+ }
+
+ /* now, allocate fbarray itself */
+ if (alloc_memseg_list(msl, page_sz, n_segs, socket,
+ msl_idx) < 0)
+ return -1;
+
+ /* finally, allocate VA space */
+ if (alloc_va_space(msl) < 0)
+ return -1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Split the hugepage table into runs of pages that share a socket and a page
+ * size and whose physical addresses follow one another, and hand each run to
+ * remap_segment(). A run is only remapped once its end is known: memseg lists
+ * cannot be filled on the fly because PPC64 stores pages backwards.
+ *
+ * Scanning stops at n_pages or at the first entry whose size is zero.
+ * Returns 0 on success, -1 as soon as remap_segment() fails.
+ */
+static int
+remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages)
+{
+ int idx, run_start = 0;
+
+ for (idx = 0; idx < n_pages; idx++) {
+ struct hugepage_file *cur = &hugepages[idx];
+ struct hugepage_file *prev;
+ int pa_adjacent;
+
+ /* if size is zero, no more pages left */
+ if (cur->size == 0)
+ break;
+
+ /* the very first page opens the first run, nothing to remap */
+ if (idx == 0)
+ continue;
+
+ prev = &hugepages[idx - 1];
+
+#ifdef RTE_ARCH_PPC_64
+ /* On PPC64 architecture, the mmap always start from higher
+ * address to lower address. Here, physical addresses are in
+ * descending order.
+ */
+ pa_adjacent = (prev->physaddr - cur->physaddr) == cur->size;
+#else
+ pa_adjacent = (cur->physaddr - prev->physaddr) == cur->size;
+#endif
+
+ /* continuation of previous memseg */
+ if (cur->socket_id == prev->socket_id &&
+ cur->size == prev->size && pa_adjacent)
+ continue;
+
+ /* the run ended just before this page, so remap it */
+ if (remap_segment(hugepages, run_start, idx) != 0)
+ return -1;
+
+ /* remember where we started */
+ run_start = idx;
+ }
+ /* we were stopped, but we didn't remap the last segment, do it now */
+ if (idx != 0 && remap_segment(hugepages, run_start, idx) != 0)
+ return -1;
+
+ return 0;
+}
+
static inline uint64_t
get_socket_mem_size(int socket)
{
@@ -753,8 +1089,10 @@ calc_num_pages_per_socket(uint64_t * memory,
/* if specific memory amounts per socket weren't requested */
if (internal_config.force_sockets == 0) {
+ size_t total_size;
+#ifdef RTE_ARCH_64
int cpu_per_socket[RTE_MAX_NUMA_NODES];
- size_t default_size, total_size;
+ size_t default_size;
unsigned lcore_id;
/* Compute number of cores per socket */
@@ -772,7 +1110,7 @@ calc_num_pages_per_socket(uint64_t * memory,
/* Set memory amount per socket */
default_size = (internal_config.memory * cpu_per_socket[socket])
- / rte_lcore_count();
+ / rte_lcore_count();
/* Limit to maximum available memory on socket */
default_size = RTE_MIN(default_size, get_socket_mem_size(socket));
@@ -789,12 +1127,33 @@ calc_num_pages_per_socket(uint64_t * memory,
for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
/* take whatever is available */
default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
- total_size);
+ total_size);
/* Update sizes */
memory[socket] += default_size;
total_size -= default_size;
}
+#else
+ /* in 32-bit mode, allocate all of the memory only on master
+ * lcore socket
+ */
+ total_size = internal_config.memory;
+ for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
+ socket++) {
+ struct rte_config *cfg = rte_eal_get_configuration();
+ unsigned int master_lcore_socket;
+
+ master_lcore_socket =
+ rte_lcore_to_socket_id(cfg->master_lcore);
+
+ if (master_lcore_socket != socket)
+ continue;
+
+ /* Update sizes */
+ memory[socket] = total_size;
+ break;
+ }
+#endif
}
for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
@@ -842,7 +1201,8 @@ calc_num_pages_per_socket(uint64_t * memory,
}
}
/* if we didn't satisfy all memory requirements per socket */
- if (memory[socket] > 0) {
+ if (memory[socket] > 0 &&
+ internal_config.socket_mem[socket] != 0) {
/* to prevent icc errors */
requested = (unsigned) (internal_config.socket_mem[socket] /
0x100000);
@@ -928,11 +1288,13 @@ eal_legacy_hugepage_init(void)
struct rte_mem_config *mcfg;
struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+ struct rte_fbarray *arr;
+ struct rte_memseg *ms;
uint64_t memory[RTE_MAX_NUMA_NODES];
unsigned hp_offset;
- int i, j, new_memseg;
+ int i, j;
int nr_hugefiles, nr_hugepages = 0;
void *addr;
@@ -945,6 +1307,25 @@ eal_legacy_hugepage_init(void)
/* hugetlbfs can be disabled */
if (internal_config.no_hugetlbfs) {
+ struct rte_memseg_list *msl;
+ uint64_t page_sz;
+ int n_segs, cur_seg;
+
+ /* nohuge mode is legacy mode */
+ internal_config.legacy_mem = 1;
+
+ /* create a memseg list */
+ msl = &mcfg->memsegs[0];
+
+ page_sz = RTE_PGSIZE_4K;
+ n_segs = internal_config.memory / page_sz;
+
+ if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
+ return -1;
+ }
+
addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
if (addr == MAP_FAILED) {
@@ -952,14 +1333,28 @@ eal_legacy_hugepage_init(void)
strerror(errno));
return -1;
}
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- mcfg->memseg[0].iova = (uintptr_t)addr;
- else
- mcfg->memseg[0].iova = RTE_BAD_IOVA;
- mcfg->memseg[0].addr = addr;
- mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
- mcfg->memseg[0].len = internal_config.memory;
- mcfg->memseg[0].socket_id = 0;
+ msl->base_va = addr;
+ msl->page_sz = page_sz;
+ msl->socket_id = 0;
+
+ /* populate memsegs. each memseg is one page long */
+ for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
+ arr = &msl->memseg_arr;
+
+ ms = rte_fbarray_get(arr, cur_seg);
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ ms->iova = (uintptr_t)addr;
+ else
+ ms->iova = RTE_BAD_IOVA;
+ ms->addr = addr;
+ ms->hugepage_sz = page_sz;
+ ms->socket_id = 0;
+ ms->len = page_sz;
+
+ rte_fbarray_set_used(arr, cur_seg);
+
+ addr = RTE_PTR_ADD(addr, (size_t)page_sz);
+ }
return 0;
}
@@ -992,7 +1387,6 @@ eal_legacy_hugepage_init(void)
for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
memory[i] = internal_config.socket_mem[i];
-
/* map all hugepages and sort them */
for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
unsigned pages_old, pages_new;
@@ -1010,8 +1404,7 @@ eal_legacy_hugepage_init(void)
/* map all hugepages available */
pages_old = hpi->num_pages[0];
- pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi,
- memory, 1);
+ pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
if (pages_new < pages_old) {
RTE_LOG(DEBUG, EAL,
"%d not %d hugepages of size %u MB allocated\n",
@@ -1054,18 +1447,6 @@ eal_legacy_hugepage_init(void)
qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
sizeof(struct hugepage_file), cmp_physaddr);
- /* remap all hugepages */
- if (map_all_hugepages(&tmp_hp[hp_offset], hpi, NULL, 0) !=
- hpi->num_pages[0]) {
- RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
- (unsigned)(hpi->hugepage_sz / 0x100000));
- goto fail;
- }
-
- /* unmap original mappings */
- if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)
- goto fail;
-
/* we have processed a num of hugepages of this size, so inc offset */
hp_offset += hpi->num_pages[0];
}
@@ -1148,7 +1529,7 @@ eal_legacy_hugepage_init(void)
/*
* copy stuff from malloc'd hugepage* to the actual shared memory.
- * this procedure only copies those hugepages that have final_va
+ * this procedure only copies those hugepages that have orig_va
* not NULL. has overflow protection.
*/
if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
@@ -1157,6 +1538,23 @@ eal_legacy_hugepage_init(void)
goto fail;
}
+#ifndef RTE_ARCH_64
+ /* for legacy 32-bit mode, we did not preallocate VA space, so do it */
+ if (internal_config.legacy_mem &&
+ prealloc_segments(hugepage, nr_hugefiles)) {
+ RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n");
+ goto fail;
+ }
+#endif
+
+ /* remap all pages we do need into memseg list VA space, so that those
+ * pages become first-class citizens in DPDK memory subsystem
+ */
+ if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
+ RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n");
+ goto fail;
+ }
+
/* free the hugepage backing files */
if (internal_config.hugepage_unlink &&
unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) {
@@ -1168,75 +1566,30 @@ eal_legacy_hugepage_init(void)
free(tmp_hp);
tmp_hp = NULL;
- /* first memseg index shall be 0 after incrementing it below */
- j = -1;
- for (i = 0; i < nr_hugefiles; i++) {
- new_memseg = 0;
-
- /* if this is a new section, create a new memseg */
- if (i == 0)
- new_memseg = 1;
- else if (hugepage[i].socket_id != hugepage[i-1].socket_id)
- new_memseg = 1;
- else if (hugepage[i].size != hugepage[i-1].size)
- new_memseg = 1;
-
-#ifdef RTE_ARCH_PPC_64
- /* On PPC64 architecture, the mmap always start from higher
- * virtual address to lower address. Here, both the physical
- * address and virtual address are in descending order */
- else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) !=
- hugepage[i].size)
- new_memseg = 1;
- else if (((unsigned long)hugepage[i-1].final_va -
- (unsigned long)hugepage[i].final_va) != hugepage[i].size)
- new_memseg = 1;
-#else
- else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
- hugepage[i].size)
- new_memseg = 1;
- else if (((unsigned long)hugepage[i].final_va -
- (unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
- new_memseg = 1;
-#endif
+ munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
- if (new_memseg) {
- j += 1;
- if (j == RTE_MAX_MEMSEG)
- break;
+ /* we're not going to allocate more pages, so release VA space for
+ * unused memseg lists
+ */
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ size_t mem_sz;
- mcfg->memseg[j].iova = hugepage[i].physaddr;
- mcfg->memseg[j].addr = hugepage[i].final_va;
- mcfg->memseg[j].len = hugepage[i].size;
- mcfg->memseg[j].socket_id = hugepage[i].socket_id;
- mcfg->memseg[j].hugepage_sz = hugepage[i].size;
- }
- /* continuation of previous memseg */
- else {
-#ifdef RTE_ARCH_PPC_64
- /* Use the phy and virt address of the last page as segment
- * address for IBM Power architecture */
- mcfg->memseg[j].iova = hugepage[i].physaddr;
- mcfg->memseg[j].addr = hugepage[i].final_va;
-#endif
- mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
- }
- hugepage[i].memseg_id = j;
- }
+ /* skip inactive lists */
+ if (msl->base_va == NULL)
+ continue;
+ /* skip lists where there is at least one page allocated */
+ if (msl->memseg_arr.count > 0)
+ continue;
+ /* this is an unused list, deallocate it */
+ mem_sz = (size_t)msl->page_sz * msl->memseg_arr.len;
+ munmap(msl->base_va, mem_sz);
+ msl->base_va = NULL;
- if (i < nr_hugefiles) {
- RTE_LOG(ERR, EAL, "Can only reserve %d pages "
- "from %d requested\n"
- "Current %s=%d is not enough\n"
- "Please either increase it or request less amount "
- "of memory.\n",
- i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
- RTE_MAX_MEMSEG);
- goto fail;
+ /* destroy backing fbarray */
+ rte_fbarray_destroy(&msl->memseg_arr);
}
- munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
-
return 0;
fail:
@@ -1269,11 +1622,10 @@ getFileSize(int fd)
static int
eal_legacy_hugepage_attach(void)
{
- const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
- unsigned num_hp = 0;
- unsigned i, s = 0; /* s used to track the segment number */
- unsigned max_seg = RTE_MAX_MEMSEG;
+ unsigned int num_hp = 0;
+ unsigned int i = 0;
+ unsigned int cur_seg;
off_t size = 0;
int fd, fd_hugepage = -1;
@@ -1292,50 +1644,6 @@ eal_legacy_hugepage_attach(void)
goto error;
}
- /* map all segments into memory to make sure we get the addrs */
- for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
- void *base_addr;
- size_t mmap_sz;
- int mmap_flags = 0;
-
- /*
- * the first memory segment with len==0 is the one that
- * follows the last valid segment.
- */
- if (mcfg->memseg[s].len == 0)
- break;
-
- /* get identical addresses as the primary process.
- */
-#ifdef RTE_ARCH_PPC_64
- mmap_flags |= MAP_HUGETLB;
-#endif
- mmap_sz = mcfg->memseg[s].len;
- base_addr = eal_get_virtual_area(mcfg->memseg[s].addr,
- &mmap_sz, mcfg->memseg[s].hugepage_sz, 0,
- mmap_flags);
- if (base_addr == NULL) {
- max_seg = s;
- if (rte_errno == EADDRNOTAVAIL) {
- RTE_LOG(ERR, EAL, "Could not mmap %zu bytes at [%p] - please use '--base-virtaddr' option\n",
- mcfg->memseg[s].len,
- mcfg->memseg[s].addr);
- } else {
- RTE_LOG(ERR, EAL, "Could not mmap %zu bytes at [%p]: '%s'\n",
- mcfg->memseg[s].len,
- mcfg->memseg[s].addr,
- rte_strerror(rte_errno));
- }
- if (aslr_enabled() > 0) {
- RTE_LOG(ERR, EAL, "It is recommended to "
- "disable ASLR in the kernel "
- "and retry running both primary "
- "and secondary processes\n");
- }
- goto error;
- }
- }
-
size = getFileSize(fd_hugepage);
hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
if (hp == MAP_FAILED) {
@@ -1346,46 +1654,49 @@ eal_legacy_hugepage_attach(void)
num_hp = size / sizeof(struct hugepage_file);
RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
- s = 0;
- while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
- void *addr, *base_addr;
- uintptr_t offset = 0;
- size_t mapping_size;
- /*
- * free previously mapped memory so we can map the
- * hugepages into the space
- */
- base_addr = mcfg->memseg[s].addr;
- munmap(base_addr, mcfg->memseg[s].len);
-
- /* find the hugepages for this segment and map them
- * we don't need to worry about order, as the server sorted the
- * entries before it did the second mmap of them */
- for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
- if (hp[i].memseg_id == (int)s){
- fd = open(hp[i].filepath, O_RDWR);
- if (fd < 0) {
- RTE_LOG(ERR, EAL, "Could not open %s\n",
- hp[i].filepath);
- goto error;
- }
- mapping_size = hp[i].size;
- addr = mmap(RTE_PTR_ADD(base_addr, offset),
- mapping_size, PROT_READ | PROT_WRITE,
- MAP_SHARED, fd, 0);
- close(fd); /* close file both on success and on failure */
- if (addr == MAP_FAILED ||
- addr != RTE_PTR_ADD(base_addr, offset)) {
- RTE_LOG(ERR, EAL, "Could not mmap %s\n",
- hp[i].filepath);
- goto error;
- }
- offset+=mapping_size;
- }
+ /* map all segments into memory to make sure we get the addrs. the
+ * segments themselves are already in memseg list (which is shared and
+ * has its VA space already preallocated), so we just need to map
+ * everything into correct addresses.
+ */
+ for (i = 0; i < num_hp; i++) {
+ struct hugepage_file *hf = &hp[i];
+ size_t map_sz = hf->size;
+ void *map_addr = hf->final_va;
+ struct flock lck;
+
+ /* if size is zero, no more pages left */
+ if (map_sz == 0)
+ break;
+
+ fd = open(hf->filepath, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Could not open %s: %s\n",
+ hf->filepath, strerror(errno));
+ goto error;
}
- RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
- (unsigned long long)mcfg->memseg[s].len);
- s++;
+
+ map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, fd, 0);
+ if (map_addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Could not map %s: %s\n",
+ hf->filepath, strerror(errno));
+ close(fd); /* do not leak the descriptor on the error path */
+ goto error;
+ }
+ /* set shared lock on the file. NOTE(review): the close(fd) below drops fcntl() locks - confirm */
+ lck.l_type = F_RDLCK;
+ lck.l_whence = SEEK_SET;
+ lck.l_start = 0;
+ lck.l_len = map_sz;
+ if (fcntl(fd, F_SETLK, &lck) == -1) {
+ RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
+ __func__, strerror(errno));
+ close(fd);
+ goto error;
+ }
+
+ close(fd);
}
/* unmap the hugepage config file, since we are done using it */
munmap(hp, size);
@@ -1393,8 +1704,15 @@ eal_legacy_hugepage_attach(void)
return 0;
error:
- for (i = 0; i < max_seg && mcfg->memseg[i].len > 0; i++)
- munmap(mcfg->memseg[i].addr, mcfg->memseg[i].len);
+ /* unwind: unmap every hugepage that was mapped before the failure */
+ cur_seg = 0;
+ for (cur_seg = 0; cur_seg < i; cur_seg++) {
+ struct hugepage_file *hf = &hp[cur_seg];
+ size_t map_sz = hf->size;
+ void *map_addr = hf->final_va;
+
+ munmap(map_addr, map_sz);
+ }
if (hp != NULL && hp != MAP_FAILED)
munmap(hp, size);
if (fd_hugepage >= 0)
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index c1f0f87..5101c04 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -908,7 +908,8 @@ vfio_get_group_no(const char *sysfs_base,
}
static int
-type1_map(const struct rte_memseg *ms, void *arg)
+type1_map(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
{
int *vfio_container_fd = arg;
@@ -1021,7 +1022,8 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
}
static int
-vfio_spapr_map_walk(const struct rte_memseg *ms, void *arg)
+vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
{
int *vfio_container_fd = arg;
@@ -1034,7 +1036,8 @@ struct spapr_walk_param {
uint64_t hugepage_sz;
};
static int
-vfio_spapr_window_size_walk(const struct rte_memseg *ms, void *arg)
+vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg)
{
struct spapr_walk_param *param = arg;
uint64_t max = ms->iova + ms->len;
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 0f542b1..23b339e 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -25,7 +25,6 @@ DPDK_2.0 {
rte_eal_devargs_type_count;
rte_eal_get_configuration;
rte_eal_get_lcore_state;
- rte_eal_get_physmem_layout;
rte_eal_get_physmem_size;
rte_eal_has_hugepages;
rte_eal_hpet_init;
@@ -241,7 +240,9 @@ EXPERIMENTAL {
rte_malloc_dump_heaps;
rte_mem_iova2virt;
rte_mem_virt2memseg;
+ rte_mem_virt2memseg_list;
rte_memseg_contig_walk;
+ rte_memseg_list_walk;
rte_memseg_walk;
rte_mp_action_register;
rte_mp_action_unregister;
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 9731d4c..103c015 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -100,12 +100,12 @@ static unsigned optimize_object_size(unsigned obj_size)
}
static int
-find_min_pagesz(const struct rte_memseg *ms, void *arg)
+find_min_pagesz(const struct rte_memseg_list *msl, void *arg)
{
size_t *min = arg;
- if (ms->hugepage_sz < *min)
- *min = ms->hugepage_sz;
+ if (msl->page_sz < *min)
+ *min = msl->page_sz;
return 0;
}
@@ -115,11 +115,12 @@ get_min_page_size(void)
{
size_t min_pagesz = SIZE_MAX;
- rte_memseg_walk(find_min_pagesz, &min_pagesz);
+ rte_memseg_list_walk(find_min_pagesz, &min_pagesz);
return min_pagesz == SIZE_MAX ? (size_t) getpagesize() : min_pagesz;
}
+
static void
mempool_add_elem(struct rte_mempool *mp, void *obj, rte_iova_t iova)
{
diff --git a/test/test/test_malloc.c b/test/test/test_malloc.c
index 28c241f..4b5abb4 100644
--- a/test/test/test_malloc.c
+++ b/test/test/test_malloc.c
@@ -12,6 +12,7 @@
#include <rte_common.h>
#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
#include <rte_per_lcore.h>
#include <rte_launch.h>
#include <rte_eal.h>
@@ -706,36 +707,20 @@ err_return:
}
static int
-check_socket_mem(const struct rte_memseg *ms, void *arg)
+check_socket_mem(const struct rte_memseg_list *msl, void *arg)
{
int32_t *socket = arg;
- return *socket == ms->socket_id;
+ return *socket == msl->socket_id;
}
/* Check if memory is available on a specific socket */
static int
is_mem_on_socket(int32_t socket)
{
- return rte_memseg_walk(check_socket_mem, &socket);
+ return rte_memseg_list_walk(check_socket_mem, &socket);
}
-struct walk_param {
- void *addr;
- int32_t socket;
-};
-static int
-find_socket(const struct rte_memseg *ms, void *arg)
-{
- struct walk_param *param = arg;
-
- if (param->addr >= ms->addr &&
- param->addr < RTE_PTR_ADD(ms->addr, ms->len)) {
- param->socket = ms->socket_id;
- return 1;
- }
- return 0;
-}
/*
* Find what socket a memory address is on. Only works for addresses within
@@ -744,10 +729,9 @@ find_socket(const struct rte_memseg *ms, void *arg)
static int32_t
addr_to_socket(void * addr)
{
- struct walk_param param = {.addr = addr, .socket = 0};
- if (rte_memseg_walk(find_socket, &param) > 0)
- return param.socket;
- return -1;
+ const struct rte_memseg *ms = rte_mem_virt2memseg(addr, NULL);
+ return ms == NULL ? -1 : ms->socket_id;
+
}
/* Test using rte_[c|m|zm]alloc_socket() on a specific socket */
diff --git a/test/test/test_memory.c b/test/test/test_memory.c
index c9b287c..b96bca7 100644
--- a/test/test/test_memory.c
+++ b/test/test/test_memory.c
@@ -5,8 +5,11 @@
#include <stdio.h>
#include <stdint.h>
+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
#include <rte_memory.h>
#include <rte_common.h>
+#include <rte_memzone.h>
#include "test.h"
@@ -23,12 +26,13 @@
*/
static int
-check_mem(const struct rte_memseg *ms, void *arg __rte_unused)
+check_mem(const struct rte_memseg_list *msl __rte_unused,
+ const struct rte_memseg *ms, void *arg __rte_unused)
{
volatile uint8_t *mem = (volatile uint8_t *) ms->addr;
- size_t i;
+ size_t i, max = ms->len;
- for (i = 0; i < ms->len; i++, mem++)
+ for (i = 0; i < max; i++, mem++)
*mem;
return 0;
}
diff --git a/test/test/test_memzone.c b/test/test/test_memzone.c
index cbf0cfa..0046f04 100644
--- a/test/test/test_memzone.c
+++ b/test/test/test_memzone.c
@@ -111,17 +111,17 @@ struct walk_arg {
int hugepage_16GB_avail;
};
static int
-find_available_pagesz(const struct rte_memseg *ms, void *arg)
+find_available_pagesz(const struct rte_memseg_list *msl, void *arg)
{
struct walk_arg *wa = arg;
- if (ms->hugepage_sz == RTE_PGSIZE_2M)
+ if (msl->page_sz == RTE_PGSIZE_2M)
wa->hugepage_2MB_avail = 1;
- if (ms->hugepage_sz == RTE_PGSIZE_1G)
+ if (msl->page_sz == RTE_PGSIZE_1G)
wa->hugepage_1GB_avail = 1;
- if (ms->hugepage_sz == RTE_PGSIZE_16M)
+ if (msl->page_sz == RTE_PGSIZE_16M)
wa->hugepage_16MB_avail = 1;
- if (ms->hugepage_sz == RTE_PGSIZE_16G)
+ if (msl->page_sz == RTE_PGSIZE_16G)
wa->hugepage_16GB_avail = 1;
return 0;
@@ -138,7 +138,7 @@ test_memzone_reserve_flags(void)
memset(&wa, 0, sizeof(wa));
- rte_memseg_walk(find_available_pagesz, &wa);
+ rte_memseg_list_walk(find_available_pagesz, &wa);
hugepage_2MB_avail = wa.hugepage_2MB_avail;
hugepage_1GB_avail = wa.hugepage_1GB_avail;