История изменений

Исправление post-factum, 24.12.20 13:30 (текущая версия):

Вообще, мне кажется, что всё говно. А всё потому, что мне опять-таки кажется, что точно такой же механизм в ядре уже есть, и работает он следующим образом.

В shrink_node() есть такие строчки:

	/*
	 * Prevent the reclaimer from falling into the cache trap: as
	 * cache pages start out inactive, every cache fault will tip
	 * the scan balance towards the file LRU.  And as the file LRU
	 * shrinks, so does the window for rotation from references.
	 * This means we have a runaway feedback loop where a tiny
	 * thrashing file LRU becomes infinitely more attractive than
	 * anon pages.  Try to detect this based on file LRU size.
	 */
	if (!cgroup_reclaim(sc)) {
		unsigned long total_high_wmark = 0;
		unsigned long free, anon;
		int z;

		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
			   node_page_state(pgdat, NR_INACTIVE_FILE);

		for (z = 0; z < MAX_NR_ZONES; z++) {
			struct zone *zone = &pgdat->node_zones[z];
			if (!managed_zone(zone))
				continue;

			total_high_wmark += high_wmark_pages(zone);
		}

		/*
		 * Consider anon: if that's low too, this isn't a
		 * runaway file reclaim problem, but rather just
		 * extreme pressure. Reclaim as per usual then.
		 */
		anon = node_page_state(pgdat, NR_INACTIVE_ANON);

		sc->file_is_tiny =
			file + free <= total_high_wmark &&
			!(sc->may_deactivate & DEACTIVATE_ANON) &&
			anon >> sc->priority;
	}

Обратите внимание на file_is_tiny. Оно зависит (внезапно, и какого чёрта вообще) от суммы high watermark’ов всех зон. В моём случае это около 130 МиБ по умолчанию, кстати.

Так вот, потом в том же get_scan_count(), который мы тут сообща насилуем, есть условие:

	/*
	 * If the system is almost out of file pages, force-scan anon.
	 */
	if (sc->file_is_tiny) {
		scan_balance = SCAN_ANON;
		goto out;
	}

А теперь внимание, вопрос: что мешает поменять условие для file_is_tiny, чтобы добиться того же, только с менее идиотскими правками в коде и (частично) отвязав его от watermark’ов?

Например, так (псевдокод, не пытайтесь это компилить):

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c017f44960f6..1c2661a49647 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2788,7 +2788,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
 
 		sc->file_is_tiny =
-			file + free <= total_high_wmark &&
+			((file + free <= total_high_wmark) ||
+			 K(file) <= sysctl_unevictable_file_kbytes) &&
 			!(sc->may_deactivate & DEACTIVATE_ANON) &&
 			anon >> sc->priority;
 	}

Исходная версия post-factum, 24.12.20 13:26:

В shrink_node() есть такие строчки:

	/*
	 * Prevent the reclaimer from falling into the cache trap: as
	 * cache pages start out inactive, every cache fault will tip
	 * the scan balance towards the file LRU.  And as the file LRU
	 * shrinks, so does the window for rotation from references.
	 * This means we have a runaway feedback loop where a tiny
	 * thrashing file LRU becomes infinitely more attractive than
	 * anon pages.  Try to detect this based on file LRU size.
	 */
	if (!cgroup_reclaim(sc)) {
		unsigned long total_high_wmark = 0;
		unsigned long free, anon;
		int z;

		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
			   node_page_state(pgdat, NR_INACTIVE_FILE);

		for (z = 0; z < MAX_NR_ZONES; z++) {
			struct zone *zone = &pgdat->node_zones[z];
			if (!managed_zone(zone))
				continue;

			total_high_wmark += high_wmark_pages(zone);
		}

		/*
		 * Consider anon: if that's low too, this isn't a
		 * runaway file reclaim problem, but rather just
		 * extreme pressure. Reclaim as per usual then.
		 */
		anon = node_page_state(pgdat, NR_INACTIVE_ANON);

		sc->file_is_tiny =
			file + free <= total_high_wmark &&
			!(sc->may_deactivate & DEACTIVATE_ANON) &&
			anon >> sc->priority;
	}

Так вот, потом в том же get_scan_count(), который мы тут сообща насилуем, есть условие:

	/*
	 * If the system is almost out of file pages, force-scan anon.
	 */
	if (sc->file_is_tiny) {
		scan_balance = SCAN_ANON;
		goto out;
	}

Например, так (псевдокод, не пытайтесь это компилить):

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c017f44960f6..1c2661a49647 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2788,7 +2788,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
 
 		sc->file_is_tiny =
-			file + free <= total_high_wmark &&
+			((file + free <= total_high_wmark) ||
+			 K(file) <= sysctl_unevictable_activefile_kbytes) &&
 			!(sc->may_deactivate & DEACTIVATE_ANON) &&
 			anon >> sc->priority;
 	}