Smart Analyze Greenplum Tables

Step 1: create config table

[gpadmin@sachi ~]$ psql -d sachi

psql (8.2.15)

Type "help" for help.

sachi=# 

CREATE TABLE dbadmin.smart_analyze_config (

schemaname TEXT NOT NULL,      -- schema name

tablename TEXT NOT NULL,          --  table name

columns TEXT,  -- Comma separated list of columns to be analyzed. Default is distribution key. If this field is RANDOM or empty, all columns will be analyzed

threshold INTERVAL NOT NULL     -- Interval (in hours) of time after which statistics for this table are considered stale

)

WITH (APPENDONLY=FALSE)

DISTRIBUTED RANDOMLY;

Version 2 -> Add relpages and reltuples in the table.

sachi=# 

CREATE TABLE dbadmin.smart_analyze_config (

schemaname TEXT NOT NULL,      -- schema name

tablename TEXT NOT NULL,          --  table name

columns TEXT,  -- Comma separated list of columns to be analyzed. Default is distribution key. If this field is RANDOM or empty, all columns will be analyzed

threshold INTERVAL NOT NULL,     -- Interval (in hours) of time after which statistics for this table are considered stale

relpages integer not null,

reltuples  real not null

)

WITH (APPENDONLY=FALSE)

DISTRIBUTED RANDOMLY;

Step 2: Insert meta data in the table

insert into dbadmin.smart_analyze_config (schemaname,tablename,columns,threshold)

SELECT pgn.nspname , 

pgc.relname , 

pga.attname ,

'24 hours'

FROM 

(SELECT gdp.localoid, 

CASE WHEN ( Array_upper(gdp.attrnums, 1) > 0 ) THEN Unnest(gdp.attrnums) ELSE NULL END AS attnum 

FROM gp_distribution_policy gdp 

ORDER BY gdp.localoid) AS distrokey 

INNER JOIN pg_class AS pgc ON distrokey.localoid = pgc.oid 

INNER JOIN pg_namespace pgn ON pgc.relnamespace = pgn.oid 

AND pgn.nspname NOT IN ('gp_toolkit', 'information_schema', 'pg_catalog') 

AND pgc.relkind='r'

AND pgc.relstorage!='x'

LEFT OUTER JOIN pg_attribute pga ON distrokey.attnum = pga.attnum 

AND distrokey.localoid = pga.attrelid 

ORDER BY pgn.nspname, pgc.relname;

Version 2 -> Add relpages and reltuples in the table.

insert into smart_analyze_config (schemaname,tablename,columns,threshold,relpages,reltuples) 

SELECT 

pgn.nspname , 

 pgc.relname , 

 pga.attname , 

 '72 hours', 

 pgc.relpages, 

 pgc.reltuples 

FROM 

 (SELECT gdp.localoid, 

 CASE WHEN ( Array_upper(gdp.attrnums, 1) > 0 ) THEN Unnest(gdp.attrnums) ELSE NULL END AS attnum 

 FROM gp_distribution_policy gdp 

 ORDER BY gdp.localoid) AS distrokey 

 INNER JOIN pg_class AS pgc ON distrokey.localoid = pgc.oid 

 INNER JOIN pg_namespace pgn ON pgc.relnamespace = pgn.oid 

 AND pgn.nspname NOT IN ('gp_toolkit', 'information_schema', 'pg_catalog') 

 AND pgc.relkind='r' AND pgc.relstorage!='x' 

 LEFT OUTER JOIN pg_attribute pga ON distrokey.attnum = pga.attnum 

 AND distrokey.localoid = pga.attrelid ORDER BY pgn.nspname, pgc.relname;

INSERT 0 275

Time: 103.499 ms

Step 3: Create smart analyze SQL and look at the output. make any corrections if you see something not right. 

select schemaname,tablename,lastnalyzed,

CASE

WHEN columns is NULL THEN 'ANALYZE '||'"'||schemaname||'"'||'.'||'"'||tablename||'"'||';'

ELSE 'ANALYZE '||'"'||schemaname||'"'||'.'||'"'||tablename||'"'||'('||columns||')'||';'  END as analyzecommand

 from

 (

 select pso.schemaname,pso.objname as tablename,columns,max(pso.statime) as lastnalyzed,max(sac.threshold) as thresholdinterval

 from pg_stat_operations pso ,smart_analyze_config sac

 where pso.schemaname=sac.schemaname

 and pso.objname=sac.tablename

 and pso.actionname='ANALYZE'

 group by 1,2,3

 order by 3 desc                                                                                                                                              

) as a

where now()-lastnalyzed>thresholdinterval

order by 3 asc;

Step 4: Save above SQL in a file smartanalyze.sql 

Step 6: From the gpadmin login run following command

[gpadmin@sachi scripts]$ psql -d sachi -Alt -f smartanalyze.sql|awk '{ print $8,$9 }'|psql -X -d sachi