LINUX.ORG.RU

Оптимизировать запрос в котором группируется большое кол-во данных

 ,


0

2

Иcпользую СУБД PostgreSQL

Запрос

 WITH         
		ar AS(
		    SELECT *
		    FROM "ActualDocuments"
		    WHERE 
			"Account" = ANY('{3344433}'::bigint[]) AND
			"MinimalDate" < '2021-09-20'::date 			 
		),      
        user_null AS(
            SELECT
                   DISTINCT
                pp."Account" AS "Account",
                pp."User" AS "Responsible",
                pp."Event" AS "Event",
                array_agg(DISTINCT pp."Subscriber") AS "Subscriber",
                MIN(pp."EndDate") AS "MinimalDate"
            FROM "Documents" pp           
                INNER JOIN ar ON (
			        pp."Account" = ar."Account" AND
				    pp."Event" = ar."Event"
			    )          
            WHERE               
                pp."User" IS NULL                
            GROUP BY pp."Account", pp."User", pp."Event"
        ),
        user_not_null AS(
            SELECT DISTINCT
                pp."Account" AS "Account",
                pp."User" AS "Responsible",
                pp."Event" AS "Event",
                array_agg(DISTINCT pp."Subscriber") AS "Subscriber",
                MIN(pp."EndDate") AS "MinimalDate"
            FROM "Documents" pp         
                INNER JOIN ar ON (
			        pp."Account" = ar."Account" AND
				    pp."Event" = ar."Event"
			    )
            WHERE                
                pp."User" IS NOT NULL                
            GROUP BY pp."Account", pp."User", pp."Event"
        )
		SELECT *
		FROM user_not_null 
		UNION
		SELECT *
		FROM user_null

План

"HashAggregate  (cost=29881.28..29915.65 rows=3437 width=64) (actual time=4117.404..4151.053 rows=107270 loops=1)"
"  Group Key: user_not_null."Account", user_not_null."Responsible", user_not_null."Event", user_not_null."Subscriber", user_not_null."MinimalDate""
"  Buffers: shared hit=735944 dirtied=3, temp read=6722 written=8057"
"  CTE ar"
"    ->  Bitmap Heap Scan on "ActualDocuments"  (cost=36.30..1993.75 rows=465 width=73) (actual time=0.216..1.173 rows=120 loops=1)"
"          Recheck Cond: (("Account" = ANY ('{3344433}'::bigint[])) AND ("MinimalDate" IS NOT NULL))"
"          Filter: ("MinimalDate" < '2021-09-20'::date)"
"          Rows Removed by Filter: 119"
"          Heap Blocks: exact=277"
"          Buffers: shared hit=286"
"          ->  Bitmap Index Scan on "iMinimalDate"  (cost=0.00..36.18 rows=519 width=0) (actual time=0.171..0.172 rows=883 loops=1)"
"                Index Cond: ("Account" = ANY ('{3344433}'::bigint[]))"
"                Buffers: shared hit=9"
"  CTE user_null"
"    ->  HashAggregate  (cost=13603.55..13633.30 rows=2975 width=64) (actual time=1718.383..1718.436 rows=68 loops=1)"
"          Group Key: pp."Account", pp."User", pp."Event", array_agg(DISTINCT pp."Subscriber"), min(pp."EndDate")"
"          Buffers: shared hit=308557 dirtied=3, temp read=3060 written=3080"
"          ->  GroupAggregate  (cost=13484.55..13566.36 rows=2975 width=64) (actual time=710.150..1714.046 rows=68 loops=1)"
"                Group Key: pp."Account", pp."User", pp."Event""
"                Buffers: shared hit=308557 dirtied=3, temp read=3060 written=3080"
"                ->  Sort  (cost=13484.55..13491.99 rows=2975 width=37) (actual time=701.077..778.699 rows=343548 loops=1)"
"                      Sort Key: pp."Account", pp."User", pp."Event""
"                      Sort Method: external merge  Disk: 11032kB"
"                      Buffers: shared hit=308557 dirtied=3, temp read=3060 written=3080"
"                      ->  Nested Loop  (cost=0.56..13312.91 rows=2975 width=37) (actual time=0.037..418.331 rows=343548 loops=1)"
"                            Buffers: shared hit=308557 dirtied=3"
"                            ->  CTE Scan on ar  (cost=0.00..9.30 rows=465 width=12) (actual time=0.001..0.143 rows=120 loops=1)"
"                            ->  Index Scan using "iUserNull" on "Documents" pp  (cost=0.56..28.55 rows=6 width=37) (actual time=0.012..3.119 rows=2863 loops=120)"
"                                  Index Cond: (("Account" = ar."Account") AND ("Event" = ar."Event"))"
"                                  Buffers: shared hit=308557 dirtied=3"
"  CTE user_not_null"
"    ->  Unique  (cost=14101.23..14108.16 rows=462 width=64) (actual time=2247.025..2294.759 rows=107202 loops=1)"
"          Buffers: shared hit=427387, temp read=3662 written=3677"
"          ->  Sort  (cost=14101.23..14102.38 rows=462 width=64) (actual time=2247.023..2260.200 rows=107202 loops=1)"
"                Sort Key: pp_1."Account", pp_1."User", pp_1."Event", (array_agg(DISTINCT pp_1."Subscriber")), (min(pp_1."EndDate"))"
"                Sort Method: external sort  Disk: 8032kB"
"                Buffers: shared hit=427387, temp read=3662 written=3677"
"                ->  GroupAggregate  (cost=14068.07..14080.78 rows=462 width=64) (actual time=1712.571..2077.099 rows=107202 loops=1)"
"                      Group Key: pp_1."Account", pp_1."User", pp_1."Event""
"                      Buffers: shared hit=427387, temp read=2658 written=2673"
"                      ->  Sort  (cost=14068.07..14069.23 rows=462 width=37) (actual time=1712.535..1783.618 rows=237252 loops=1)"
"                            Sort Key: pp_1."Account", pp_1."User", pp_1."Event""
"                            Sort Method: external merge  Disk: 11200kB"
"                            Buffers: shared hit=427387, temp read=2658 written=2673"
"                            ->  Nested Loop  (cost=0.56..14047.62 rows=462 width=37) (actual time=0.258..1412.308 rows=237252 loops=1)"
"                                  Buffers: shared hit=427387"
"                                  ->  CTE Scan on ar ar_1  (cost=0.00..9.30 rows=465 width=12) (actual time=0.218..1.511 rows=120 loops=1)"
"                                        Buffers: shared hit=286"
"                                  ->  Index Scan using "iUser" on "Documents" pp_1  (cost=0.56..30.18 rows=1 width=37) (actual time=0.235..11.441 rows=1977 loops=120)"
"                                        Index Cond: (("Account" = ar_1."Account") AND ("Event" = ar_1."Event"))"
"                                        Buffers: shared hit=427101"
"  ->  Append  (cost=0.00..103.11 rows=3437 width=64) (actual time=2247.028..4050.176 rows=107270 loops=1)"
"        Buffers: shared hit=735944 dirtied=3, temp read=6722 written=8057"
"        ->  CTE Scan on user_not_null  (cost=0.00..9.24 rows=462 width=64) (actual time=2247.027..2320.721 rows=107202 loops=1)"
"              Buffers: shared hit=427387, temp read=3662 written=4678"
"        ->  CTE Scan on user_null  (cost=0.00..59.50 rows=2975 width=64) (actual time=1718.393..1721.813 rows=68 loops=1)"
"              Buffers: shared hit=308557 dirtied=3, temp read=3060 written=3379"
"Planning time: 0.961 ms"
"Execution time: 4164.426 ms"

Для некоторых аккаунтов получается большое количество записей, которые нужно сгруппировать, из-за этого запрос долго выполняется по времени и shared hit memory. У меня идей не осталось



Последнее исправление: polin11 (всего исправлений: 1)

Ну как минимум ты можешь не делать два подзапроса user_null и user_not_null, а сделать один с тем же DISTINCT. У тебя же запросы одинаковые. Да и ar можно не делать подзапросом, а просто джоином ограничиться. Что-то типа:

            SELECT DISTINCT
                pp."Account" AS "Account",
                pp."User" AS "Responsible",
                pp."Event" AS "Event",
                array_agg(DISTINCT pp."Subscriber") AS "Subscriber",
                MIN(pp."EndDate") AS "MinimalDate"
            FROM "Documents" pp         
                INNER JOIN "ActualDocuments" as ar ON (
			        pp."Account" = ar."Account" AND
				    pp."Event" = ar."Event"
			    )
            WHERE                
                ar."Account" = ANY('{3344433}'::bigint[]) AND
                ar."MinimalDate" < '2021-09-20'::date
            GROUP BY pp."Account", pp."User", pp."Event"

Так три запроса схлопываются в один

Aswed ★★★★★
()
Последнее исправление: Aswed (всего исправлений: 1)
Ответ на: комментарий от Aswed

можно пойти дальше и заменить inner join на exist(select) да и DISTINCT убрать аля

SELECT 
                pp."Account" AS "Account",
                pp."User" AS "Responsible",
                pp."Event" AS "Event",
                array_agg(DISTINCT pp."Subscriber") AS "Subscriber",
                MIN(pp."EndDate") AS "MinimalDate"
            FROM "Documents" pp    
where exists(select 1 from ActualDocuments ad where pp."Account" = ad."Account" AND pp."Event" = ad."Event" and ad."Account" = ANY('{3344433}'::bigint[]) AND ad."MinimalDate" < '2021-09-20'::date)
GROUP BY pp."Account", pp."User", pp."Event"

vtVitus ★★★★★
()
Ответ на: комментарий от vtVitus

а дальше выкинуть ещё условия и упростить к:

SELECT 
                pp."Account" AS "Account",
                pp."User" AS "Responsible",
                pp."Event" AS "Event",
                array_agg(DISTINCT pp."Subscriber") AS "Subscriber",
                MIN(pp."EndDate") AS "MinimalDate"
            FROM "Documents" pp    
WHERE pp."Account" = ANY('{3344433}'::bigint[]) and 
      pp."Event" in (select ad."Event" from ActualDocuments ad where ad."Account" = ANY('{3344433}'::bigint[]) and ad."MinimalDate" < '2021-09-20'::date)
GROUP BY pp."Account", pp."User", pp."Event"


как-то так.
vtVitus ★★★★★
()
Вы не можете добавлять комментарии в эту тему. Тема перемещена в архив.