ansaurus

Question

Check if the PDF document is blank in C#

Answer 1

A:

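Take a look at com.itextpdf.text.pdf.parser.PdfContentReaderTool (in the C# port, iTextSharp, it is iTextSharp.text.pdf.parser.PdfContentReaderTool):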

http://www.java2s.com/Open-Source/CSharp/PDF/iTextSharp/iTextSharp/text/pdf/parser/PdfContentReaderTool.cs.htm

It seems that for a blank page this reaches 'outp.WriteLine("No text found on page " + pageNum);', so a blank document would report that for all pages (or for page 0/1) :)

I think this can help you, hopefully.

CODE
namespace iTextSharp.text.pdf.parser{

    /**
     * Tool that parses the content of a PDF document.
     * @since   2.1.4
     */
    public class PdfContentReaderTool {

        /**
         * Builds a textual dump of a dictionary, starting at nesting depth 0.
         * This is similar to the PdfLister functionality.
         * Convenience overload that delegates to GetDictionaryDetail(dic, depth).
         * @param dic   the dictionary to describe
         * @return  the String produced by GetDictionaryDetail(dic, 0)
         */
        public static String GetDictionaryDetail(PdfDictionary dic){
            const int topLevelDepth = 0;
            return GetDictionaryDetail(dic, topLevelDepth);
        }

        /**
         * Shows the detail of a dictionary.
         * The result is "(key=value, key=value, ...)" for the entries of dic, followed by
         * one extra line per entry whose value is itself a dictionary. Each such line is
         * indented with depth+1 tabs and contains the recursive detail of that subdictionary.
         * An empty dictionary is rendered as "()".
         * @param dic   the dictionary of which you want the detail
         * @param depth the depth of the current dictionary (for nested dictionaries)
         * @return  a String representation of the dictionary
         */
        public static String GetDictionaryDetail(PdfDictionary dic, int depth){
            const string separator = ", ";
            StringBuilder builder = new StringBuilder();
            builder.Append('(');
            IList<PdfName> subDictionaries = new List<PdfName>();
            bool hasEntries = false;
            foreach (PdfName key in dic.Keys) {
                PdfObject val = dic.GetDirectObject(key);
                // Guard against a value that could not be resolved (assumed to come back as null).
                if (val != null && val.IsDictionary())
                    subDictionaries.Add(key);
                builder.Append(key);
                builder.Append('=');
                builder.Append(val);
                builder.Append(separator);
                hasEntries = true;
            }
            // Drop the trailing separator. Only do so when one was written: for an empty
            // dictionary the builder holds just "(" and shrinking it by two characters
            // would set a negative length and throw ArgumentOutOfRangeException.
            if (hasEntries)
                builder.Length = builder.Length - separator.Length;
            builder.Append(')');
            foreach (PdfName pdfSubDictionaryName in subDictionaries) {
                builder.Append('\n');
                for (int i = 0; i < depth+1; i++){
                    builder.Append('\t');
                }
                builder.Append("Subdictionary ");
                builder.Append(pdfSubDictionaryName);
                builder.Append(" = ");
                builder.Append(GetDictionaryDetail(dic.GetAsDict(pdfSubDictionaryName), depth+1));
            }
            return builder.ToString();
        }

        /**
         * Displays a summary of the entries in the XObject dictionary for the stream.
         * For every entry a header line with its name, subtype and length is written.
         * For entries whose subtype is not Image, the content bytes are appended as
         * characters, followed by an "End of Content" line.
         * @param resourceDic the resource dictionary for the stream; may be null, in which
         *                    case "No XObjects" is returned
         * @return a string with the summary of the entries, or "No XObjects" when there is
         *         no XObject dictionary
         * @throws IOException
         * @since 5.0.2
         */
        public static String GetXObjectDetail(PdfDictionary resourceDic) {
            // The caller may have no resource dictionary to pass (e.g. a page dictionary
            // without its own /Resources entry); treat that like a missing /XObject entry
            // instead of failing with a NullReferenceException.
            PdfDictionary xobjects = resourceDic == null ? null : resourceDic.GetAsDict(PdfName.XOBJECT);
            if (xobjects == null)
                return "No XObjects";

            StringBuilder sb = new StringBuilder();
            foreach (PdfName entryName in xobjects.Keys) {
                PdfStream xobjectStream = xobjects.GetAsStream(entryName);
                if (xobjectStream == null) {
                    // Entry did not resolve to a stream (assumed to be reported as null):
                    // note it and move on rather than dereferencing null.
                    sb.Append("------ " + entryName + " - not a stream ------\n");
                    continue;
                }

                PdfObject subtype = xobjectStream.Get(PdfName.SUBTYPE);

                sb.Append("------ " + entryName + " - subtype = " + subtype + " = " + xobjectStream.GetAsNumber(PdfName.LENGTH) + " bytes ------\n");

                // Compare from the constant side so a stream without /Subtype does not throw.
                if (!PdfName.IMAGE.Equals(subtype)){

                    byte[] contentBytes = ContentByteUtils.GetContentBytesFromContentObject(xobjectStream);

                    // Each byte is appended as a single character (no text decoding is applied).
                    foreach (byte b in contentBytes) {
                        sb.Append((char)b);
                    }

                    sb.Append("------ " + entryName + " - subtype = " + subtype + "End of Content" + "------\n");
                }
            }

            return sb.ToString();
        }

        /**
         * Writes information about a specific page from PdfReader to the specified writer.
         * The output consists of four sections: the page dictionary detail, the XObject
         * summary of the page resources, the raw page content bytes (written one character
         * per byte), and the text extracted with a LocationTextExtractionStrategy. When no
         * text is extracted, "No text found on page N" is written instead.
         * @since 2.1.5
         * @param reader    the PdfReader to read the page content from
         * @param pageNum   the page number to read
         * @param outp      the writer to send the content to
         * @throws IOException
         */
        public static void ListContentStreamForPage(PdfReader reader, int pageNum, TextWriter outp) {
            outp.WriteLine("==============Page " + pageNum + "====================");
            outp.WriteLine("- - - - - Dictionary - - - - - -");
            PdfDictionary pageDictionary = reader.GetPageN(pageNum);
            outp.WriteLine(GetDictionaryDetail(pageDictionary));

            outp.WriteLine("- - - - - XObject Summary - - - - - -");
            outp.WriteLine(GetXObjectDetail(pageDictionary.GetAsDict(PdfName.RESOURCES)));

            outp.WriteLine("- - - - - Content Stream - - - - - -");
            RandomAccessFileOrArray f = reader.SafeFile;

            byte[] contentBytes;
            try {
                contentBytes = reader.GetPageContent(pageNum, f);
            } finally {
                // Release the file handle even when reading the page content fails.
                f.Close();
            }

            outp.Flush();

            foreach (byte b in contentBytes) {
                outp.Write((char)b);
            }

            outp.Flush();

            outp.WriteLine("- - - - - Text Extraction - - - - - -");
            String extractedText = PdfTextExtractor.GetTextFromPage(reader, pageNum, new LocationTextExtractionStrategy());
            if (extractedText.Length != 0)
                outp.WriteLine(extractedText);
            else
                outp.WriteLine("No text found on page " + pageNum);

            outp.WriteLine();

        }

        /**
         * Writes information about each page in a PDF file to the specified writer.
         * Pages are listed in order from 1 to the reader's NumberOfPages, and the reader
         * opened for the file is closed before returning.
         * @since 2.1.5
         * @param pdfFile   the path of the PDF file
         * @param outp      the writer to send the content to
         * @throws IOException
         */
        public static void ListContentStream(string pdfFile, TextWriter outp) {
            PdfReader reader = new PdfReader(pdfFile);
            try {
                int maxPageNum = reader.NumberOfPages;

                for (int pageNum = 1; pageNum <= maxPageNum; pageNum++){
                    ListContentStreamForPage(reader, pageNum, outp);
                }
            } finally {
                // The reader is created here, so it is released here, even when a page fails.
                reader.Close();
            }

        }

        /**
         * Writes information about the specified page in a PDF file to the specified writer.
         * The reader opened for the file is closed before returning.
         * @since 2.1.5
         * @param pdfFile   the path of the PDF file
         * @param pageNum   the page number to read
         * @param outp      the writer to send the content to
         * @throws IOException
         */
        public static void ListContentStream(string pdfFile, int pageNum, TextWriter outp) {
            PdfReader reader = new PdfReader(pdfFile);
            try {
                ListContentStreamForPage(reader, pageNum, outp);
            } finally {
                // The reader is created here, so it is released here, even when listing fails.
                reader.Close();
            }
        }

        /**
         * Writes information about each page in a PDF file to the specified file, or to the console.
         * Usage: PdfContentReaderTool <pdf file> [<output file>|stdout] [<page num>]
         * When a page number is given only that page is listed; otherwise (or when the
         * page number is -1) every page is listed. Any exception is caught and printed
         * to the console.
         * @param args  the command line arguments: PDF path, optional output target,
         *              optional page number
         */
        public static void Main(String[] args) {
            try{
                if (args.Length < 1 || args.Length > 3){
                    Console.WriteLine("Usage:  PdfContentReaderTool <pdf file> [<output file>|stdout] [<page num>]");
                    return;
                }

                TextWriter writer = Console.Out;
                // True only when this method opened a file writer that it must close itself.
                bool ownsWriter = false;
                try {
                    if (args.Length >= 2){
                        if (!Util.EqualsIgnoreCase(args[1], "stdout")) {
                            Console.WriteLine("Writing PDF content to " + args[1]);
                            writer = new StreamWriter(args[1]);
                            ownsWriter = true;
                        }
                    }

                    // -1 is the sentinel for "no page requested": list every page.
                    int pageNum = -1;
                    if (args.Length >= 3){
                        pageNum = int.Parse(args[2]);
                    }

                    if (pageNum == -1){
                        ListContentStream(args[0], writer);
                    } else {
                        ListContentStream(args[0], pageNum, writer);
                    }
                    writer.Flush();
                } finally {
                    // Close the output file even when listing fails, but never close
                    // Console.Out: it is not owned by this method and is still used below.
                    if (ownsWriter)
                        writer.Close();
                }

                if (args.Length >= 2){
                    Console.WriteLine("Finished writing content to " + args[1]);
                }
            } catch (Exception e){
                Console.WriteLine(e.ToString());
            }
        }
    }
}
/CODE

Joe Garrett 2010-07-14 00:05:21

ansaurus

tags:

views:

answers:

Check if the PDF document is blank in C#

related questions