Dynamic Analyze Configuration and Functions

These parameters control how clients connect and authenticate to Greenplum Database.

DROP TABLE IF EXISTS dynamic_analyze_config;

CREATE TABLE dynamic_analyze_config (

    schemaname TEXT NOT NULL,      -- Text representation of schema name

    tablename TEXT NOT NULL,       -- Text representation of table name

    columns TEXT[],                -- Array of column names represented as text

                                   --   objects listing the subset of columns

                                   --   to analyze; if this field is empty, all

                                   --   columns will be analyzed

    threshold INTERVAL NOT NULL    -- Interval of time after which statistics

                                   --   for this table are considered stale

)

    WITH (APPENDONLY=FALSE)

    DISTRIBUTED RANDOMLY

;

-- This code is provided on an "as is" basis as an example of pothential

-- functionality, without warntees or conditions of any kind, either express or

-- implied, including, without limitation, any warranties or conditions of

-- title, non-infringement, merchantability, or fitness for a particular

-- purpose. As the user of this code you are solely responsible for determining

-- the appropriateness of using this code and mitigating risks associated with

-- it. 

-- Usage forms:

--

--     SELECT * FROM dynamic_analyze_targeted(schema_name, max_analyzes);

--     - Run ANALYZE against at most max_analyzes tables belonging to the schema

--       named by schema_name that are configured in the dynamic_analyze_config

--       table.  If max_analyzes is not a positive integer, no such limit will

--       be observed.  Table analysis will be prioritized based upon how much

--       time has elapsed since each table has been analyzed relative to the

--       staleness threshold configured in dynamic_analyze_config.

--

--     SELECT * FROM dynamic_analyze_targeted(schema_name, max_analyzes, runtime_limit);

--     - Run ANALYZE against tables belonging to the schema named by schema_name

--       that are configured in the dynamic_analyze_config table until either:

--       1) the number of tables that have been analyzed equals/exceeds

--          max_analyzes (assuming max_analyzes is a positive integer) or

--       2) the total runtime after the completion of the most recent ANALYZE

--          has exceeded the runtime_limit parameter (interpreted as an

--          INTERVAL)

--       As with the other usage form, table analysis will be prioritized based

--       upon how much time has elapsed since each table has been analyzed

--       relative to the staleness threshold configured in

--       dynamic_analyze_config.

CREATE OR REPLACE FUNCTION dynamic_analyze_targeted (

schema_name TEXT, size int 

) RETURNS TABLE(

tablename TEXT,

analyze_start TIMESTAMP,

analyze_stop TIMESTAMP,

time_minutes NUMERIC

) AS $$

BEGIN

FOR tablename, analyze_start, analyze_stop, time_minutes IN

SELECT * FROM dynamic_analyze_targeted (

schema_name, size, '-1 SECONDS'

)

LOOP

RETURN NEXT;

END LOOP;

END;

$$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION dynamic_analyze_targeted (

schema_name TEXT, size int, runtime_limit_text TEXT

) RETURNS TABLE(

tablename TEXT,

analyze_start TIMESTAMP,

analyze_stop TIMESTAMP,

time_minutes NUMERIC

) AS $$

DECLARE

cmd TEXT; 

runtime_limit INTERVAL;

start_ts TIMESTAMP;

columns TEXT[];

columnstring TEXT;

tablesdone INTEGER;

BEGIN

tablesdone := 0;

start_ts := CLOCK_TIMESTAMP();

runtime_limit := runtime_limit_text::INTERVAL;

FOR tablename, columns IN

SELECT 

n.nspname||'.'||c.relname,

tgts.columns

FROM

-- Assemble a list of all objects known to the catalog

pg_class c INNER JOIN pg_namespace n

ON (c.relnamespace = n.oid)

-- Determine when each object has last been analyzed

LEFT OUTER JOIN (

SELECT DISTINCT

objid,

statime

FROM

pg_stat_last_operation 

WHERE

staactionname in ('ANALYZE')

) a

ON (a.objid = c.oid)

-- Get a list of objects explicitly named in the dynamic_analyze_config table

LEFT OUTER JOIN dynamic_analyze_config tgts ON (

n.nspname = tgts.schemaname AND

c.relname = tgts.tablename

)

-- Consider only tables in the named schema that have never been analyzed, or

-- for whom the timestamp of the last ANALYZE operation is outside of the

-- staleness interval set in dynamic_analyze_config

WHERE

n.nspname = schema_name

AND (

a.statime IS NULL

OR NOW()-a.statime-tgts.threshold > '0 SECONDS'::INTERVAL

)

-- Consider only tables explicitly named in dynamic_analyze_config

AND tgts.tablename IS NOT NULL

-- Prioritize tables whose statistics are more out-of-date

ORDER BY NOW()-a.statime-tgts.threshold DESC

-- Keep running until no tables remain to analyze, until the runtime

-- threshold has been exceeded, or until the table maximum has been hit.

LOOP

EXIT WHEN runtime_limit > '0 SECONDS'::INTERVAL AND CLOCK_TIMESTAMP() - start_ts > runtime_limit;

EXIT WHEN size > 0 AND tablesdone >= size;

-- Build and execute the analyze statement, taking into account whether a column

-- subset was configured in dynamic_analyze_config.

IF columns IS NULL OR columns = '{}' THEN

columnstring := '';

ELSE

columnstring := '('||array_to_string(columns, ',')||')';

END IF;

cmd := 'ANALYZE '||tablename||' '||columnstring||';';

analyze_start := CLOCK_TIMESTAMP();

EXECUTE cmd;

analyze_stop := CLOCK_TIMESTAMP();

tablesdone := tablesdone + 1;

time_minutes := EXTRACT(EPOCH FROM analyze_stop - analyze_start) / 60.0;

RETURN NEXT;

END LOOP;

END;

$$ LANGUAGE plpgsql;

-- This code is provided on an "as is" basis as an example of pothential

-- functionality, without warntees or conditions of any kind, either express or

-- implied, including, without limitation, any warranties or conditions of

-- title, non-infringement, merchantability, or fitness for a particular

-- purpose. As the user of this code you are solely responsible for determining

-- the appropriateness of using this code and mitigating risks associated with

-- it. 

-- Usage forms:

--

--     SELECT * FROM dynamic_analyze_catchall(schema_name, max_analyzes, expiry);

--     - Run ANALYZE against at most max_analyzes tables belonging to the

--       schema named by schema_name that are NOT configured in the

--       dynamic_analyze_config table.  If max_analyzes is not a positive

--       integer, no such limit will be observed.  Tables are prioritized based

--       upon how much time has elapsed since each table has been analyzed

--       relative to the expiry parameter.

--       Each table will only be analyzed if:

--       1) It is not configured in the dynamic_analyze_config table

--       2) The time that has elapsed since the table was last analyzed exceeds

--          the interval specified by the expiry parameter

--       3) If the table is part of a partitioned object, it is a lowest-level

--          child (i.e. the table actually contains data)

--       4) The table is not a staging or swapping table (by naming convention)

--

--     SELECT * FROM dynamic_analyze_catchall(schema_name, max_analyzes, expiry, runtime_limit);

--     - Run ANALYZE against at tables belonging to the schema named by

--       schema_name that are NOT configured in the dynamic_analyze_config

--       table.  Execution terminates after either:

--       1) The number of tables analyzed equals/exceeds the max_analyzes

--          parameter (assuming it is a positive integer)

--       2) The time elapsed since the start of the function execution exceeds

--          the interval specified by the runtime_limit parameter

--       All other constraints on individual tables as noted above apply.

CREATE OR REPLACE FUNCTION dynamic_analyze_catchall (

schema_name TEXT, size int, expiry TEXT

) RETURNS TABLE(

tablename TEXT,

analyze_start TIMESTAMP,

analyze_stop TIMESTAMP,

time_minutes NUMERIC

) AS $$

BEGIN

FOR tablename, analyze_start, analyze_stop, time_minutes IN

SELECT * FROM dynamic_analyze_catchall (

schema_name, size, expiry, '-1 SECONDS'

)

LOOP

RETURN NEXT;

END LOOP;

END;

$$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION dynamic_analyze_catchall (

schema_name TEXT, size int, expiry TEXT, runtime_limit_text TEXT

) RETURNS TABLE(

tablename TEXT,

analyze_start TIMESTAMP,

analyze_stop TIMESTAMP,

time_minutes NUMERIC

) AS $$

DECLARE

cmd TEXT; 

runtime_limit INTERVAL;

start_ts TIMESTAMP;

tablesdone INTEGER;

BEGIN

tablesdone := 0;

start_ts := CLOCK_TIMESTAMP();

runtime_limit := runtime_limit_text::INTERVAL;

FOR cmd, tablename IN

SELECT 

'analyze '||n.nspname||'.'||c.relname||';',

n.nspname||'.'||c.relname

FROM

-- Assemble a list of all objects known to the catalog

pg_class c INNER JOIN pg_namespace n

ON (c.relnamespace = n.oid)

LEFT OUTER JOIN pg_partitions pgp

ON (

pgp.schemaname = n.nspname

AND pgp.tablename = c.relname

)

-- Determine when each object has last been analyzed

LEFT OUTER JOIN (

SELECT DISTINCT

objid,

statime

FROM

pg_stat_last_operation 

WHERE

staactionname in ('ANALYZE')

) a

ON (a.objid = c.oid)

-- Get a list of objects explicitly named in the dynamic_analyze_config table

LEFT OUTER JOIN dynamic_analyze_config tgts ON (

n.nspname = tgts.schemaname AND

c.relname = tgts.tablename

)

-- Restrict the list of objects to non-swap and non-staging heap tables in the

-- named schema

WHERE

n.nspname = schema_name

AND c.relname not like 'stg%' and c.relname not like 'swap%'

AND relkind in ('r')

AND c.relstorage = 'h'

-- Consider only tables that have never been analyzed, or for whom the timestamp

-- of the last ANALYZE operation is outside of the specific staleness interval

AND (

a.statime IS NULL

OR a.statime < (NOW()-expiry::INTERVAL)

)

-- If a table is partitioned, ignore the ones that are not at the lowest level

-- of partitioning (and therefore would contain no data)

AND pgp.partitiontype IS NULL

-- Ignore tables explicitly named in dynamic_analyze_config

AND tgts.tablename IS NULL

-- Prioritize each table based upon the relative staleness of its statistics

ORDER BY NOW()-a.statime-expiry::INTERVAL DESC

-- Keep running until no tables remain to analyze, until the runtime

-- threshold has been exceeded, or until the number of tables meets

-- the maximum table count.

LOOP

EXIT WHEN runtime_limit > '0 SECONDS'::INTERVAL AND CLOCK_TIMESTAMP() - start_ts > runtime_limit;

EXIT WHEN size > 0 AND tablesdone >= size;

analyze_start := CLOCK_TIMESTAMP();

EXECUTE cmd;

analyze_stop := CLOCK_TIMESTAMP();

tablesdone := tablesdone + 1;

time_minutes := EXTRACT(EPOCH FROM analyze_stop - analyze_start) / 60.0;

RETURN NEXT;

END LOOP;

END;

$$ LANGUAGE plpgsql;