Finding the no of files and directories in a path - Mailing list pgsql-novice

From Nipuna
Subject Finding the no of files and directories in a path
Date
Msg-id CAPCz1_0JYiqSbO9CCpuqZsAMPq4LH9C+Mw6nQguoRGZQHE+eAg@mail.gmail.com
Whole thread Raw
List pgsql-novice
Hi,

I have a table with 45 million records.I have 3 columns say, file_size, kb_alloc, misc_attr

The misc_attr column is the path. The sample record is given below

"57971";     "65536";           "FAu -- /gpfs/scratch05/data/lm/build_weekly/official/mobi/eng-USA/messaging/large/2011_08_21/clusters/models/field/lexfiles/respell.map.spelling"
"851";        "8192";             "FAu -- /gpfs/scratch05/data/lm/build_weekly/official/mobi/eng-USA/messaging/large/2011_08_21/clusters/models/field/lexfiles/rewriteForm.map"
"8192"         "16384";         "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/txt/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"8192"       "16384"             "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/log/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"8192"       "16384"            "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/NVI/PT/MP14k/TEST_SAMPLE_SUPPRESSED_DIGITS_datapack/2p/outputlnk/recog/time/58ABB8B7-C44C11DF-A73A949F-7B4DBCC0_CALLER"
"4487";       "8192";            "FAu -- /gpfs/scratch05/data/lm/scratch/yi_su/numbered-list/data.old/gpfs/static02/data/lm/data/english/text/scansoft/english-us/medical/transcend/90/90.47915.2000.UNKNOWN.txt.in.gz"
"8192";        "16384";         "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-12.00.300"
"8192"        "16384"            "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-13.00.100"
"7"             "8192"              "FAu -- /gpfs/scratch05/data/lm/scratch/sophie_charignon/plato/jpn/.hg/undo.branch"
"8192"       "16384"              "D2u -- /gpfs/scratch05/data/lm/scratch/alfred_dielmann/DNS_VOC/legal/eng-GBR/BUILDS/BASELINE/lm/build/vocabs/natspeak/eng-GBR/legal/large/2012_04_20/plato/eng/build-bin-13.02.100"



I want a table which gives me the directory count and file count and their file size.My table should look like this;

                      dir_path                                                                         kb_alloc    USED_SIZE   folder_count     file_count
"/gpfs/scratch05/data/lm/build_weekly/vocabs";                                          49152;       37249;             9;                      1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak";                            49152;       37249;             8;                      1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak/eng-GBR";              49152;       37249;            7;                       1
"/gpfs/scratch05/data/lm/build_weekly/vocabs/natspeak/eng-GBR/legal";      49152;       37249;             6;                      1
"/gpfs/scratch05/data/lm/scratch/erik_larsson";                                          2093824;    21783001;        59;                   25
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep";                                    21651456;    21620619;     30;                    7
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep/builds";                           21602304 ;  1572331;        22;                    6
"/gpfs/scratch05/data/lm/scratch/erik_larsson/zep/builds/lm";                       21602304;   21572331;     21;                     6

 I have written a function to create the table in this format. But a per the calculation it might the take 600+ hours to return the result. Is there any way I could optimize the function I have written. Below is the function I have written;

-- Function: hierarchial_file_structure()

-- DROP FUNCTION hierarchial_file_structure();

CREATE OR REPLACE FUNCTION hierarchial_file_structure()
  RETURNS void AS
$BODY$

DECLARE

file_size1      BIGINT;
folder_count1     integer;
kb_alloc1       BIGINT;   
misc_attr1      TEXT;
folder_names       TEXT ARRAY;
path_names       TEXT ARRAY;
current_path     TEXT;
isNewRow     BOOLEAN;
mFILES          RECORD;
DataType     TEXT;
temp_count     BIGINT;
sqlcount     bigint;

BEGIN

    DELETE FROM file_structure;
 
    sqlcount :=0;
   
    FOR mFILES IN SELECT CAST(coalesce(file_size, '0') AS bigint) as file_size, CAST(coalesce(kb_alloc, '0') AS bigint) as kb_alloc, misc_attr FROM storage where (substring(misc_attr,1,1)) in ('F','D')
    LOOP

        file_size1     := mFILES.file_size;       
        kb_alloc1     := mFILES.kb_alloc;       
        misc_attr1     := mFILES.misc_attr;       

        DataType      :=(substring(misc_attr1,1,1));
        folder_names     := regexp_split_to_array(misc_attr1, '/');
        current_path     := '';

        -- LOOP THROUGH FOLDER NAMES AND CREATE THE PATH NAMES
        IF DataType = 'F' THEN
            -- IGNORE THE LAST ELEMENT SINCE ITS THE FILE NAME
           
            FOR iFolderArrayCount IN 2..(array_length(folder_names,1)-1)
            LOOP

                current_path := current_path || '/' || folder_names[iFolderArrayCount];
               
                path_names[iFolderArrayCount-1] := current_path;
               
            END LOOP;
        ELSE

            FOR iFolderArrayCount IN 2..(array_length(folder_names,1))
            LOOP

                current_path := current_path || '/' || folder_names[iFolderArrayCount];
                path_names[iFolderArrayCount-1] := current_path;
               
            END LOOP;
        END IF;

        --LOOP THROUH THE PATH NAMES

        FOR iPathArrayCount IN 1..(array_length(path_names,1))
        LOOP

            temp_count := array_length(path_names,1) - iPathArrayCount;
           
            select into folder_count1 count(*) from file_structure where dir_path = ANY(path_names[iPathArrayCount+1:array_length(path_names,1)]);

            folder_count1 := temp_count - folder_count1;
            current_path := path_names[iPathArrayCount];

           
            IF EXISTS(select distinct(dir_path) from file_structure where dir_path = current_path) THEN

                --Row Exists hence update the count using the count of folders that does not exist in the summary table
               
                IF(DataType = 'F') THEN
                   
                    update file_structure set
                        kb_alloc = kb_alloc + kb_alloc1,
                        used_size = used_size+ file_size1,
                        file_count = file_count +1,
                        folder_count = folder_count+ folder_count1
                    where dir_path = current_path;

                ELSE
                    update file_structure set
                            kb_alloc = kb_alloc + kb_alloc1,
                            used_size = used_size+ file_size1,
                            folder_count = folder_count+folder_count1
                        where dir_path = current_path;

                END IF;
            ELSE
                               
                IF(DataType = 'F') THEN
                   
                    INSERT INTO file_structure (dir_path,kb_alloc,used_size, folder_count,file_count) VALUES
                        (current_path,kb_alloc1, file_size1,folder_count1,1);
                ELSE
                    INSERT INTO file_structure (dir_path,kb_alloc,used_size, folder_count,file_count) VALUES
                        (current_path,kb_alloc1, file_size1,folder_count1,0);
                END IF;
                   
            END IF;
        END LOOP;
    END LOOP;

END;

$BODY$
  LANGUAGE plpgsql VOLATILE
  COST 100;
ALTER FUNCTION hierarchial_file_structure()
  OWNER TO postgres;


Please let me know if there is a better way to write this function.Thanks in advance


--
Regards,
Nipuna
 

pgsql-novice by date:

Previous
From: Tom Lane
Date:
Subject: Re: Re: [NOVICE] Re: [NOVICE] Problems with ñ and tildes / CSV import problems in PostgreSQL 9.1
Next
From: Zach Seaman
Date:
Subject: Re: [NOVICE] Re: [NOVICE] Re: [NOVICE] Problems with ñ and tildes / CSV import problems in PostgreSQL 9.1