| 注册
请输入搜索内容

热门搜索

Java Linux MySQL PHP JavaScript Hibernate jQuery Nginx
cymt
10年前发布

C# 用 iTextSharp 将 PDF 转成文本

using System;
using System.Diagnostics;
using System.IO;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

/// <summary>
/// Command-line tool that writes the string tokens found in every page
/// content stream of a PDF to a text file.
/// </summary>
public class ParsingPDF
{
    /// <summary>
    /// Parses the PDF using <c>PRTokeniser</c> and writes each string token of
    /// each page's content stream to the destination file, one token per line.
    /// </summary>
    /// <param name="src">The path to the original PDF file.</param>
    /// <param name="dest">The path to the resulting text file (created or overwritten).</param>
    public void parsePdf(String src, String dest)
    {
        // Open the reader first so that an unreadable source does not create
        // (or truncate) the destination file.
        PdfReader reader = new PdfReader(src);
        try
        {
            // Disposing the writer flushes it and closes the underlying file,
            // even when tokenising throws part-way through.
            using (StreamWriter output = new StreamWriter(new FileStream(dest, FileMode.Create)))
            {
                int pageCount = reader.NumberOfPages;

                // PDF page numbers are 1-based.
                for (int pg = 1; pg <= pageCount; pg++)
                {
                    // Inspect the raw syntax of the page's content stream.
                    byte[] streamBytes = reader.GetPageContent(pg);
                    PRTokeniser tokenizer = new PRTokeniser(streamBytes);
                    while (tokenizer.NextToken())
                    {
                        if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                        {
                            output.WriteLine(tokenizer.StringValue);
                        }
                    }
                }
            }
        }
        finally
        {
            // Release the reader on both the success and the failure path.
            reader.Close();
        }
    }

    /// <summary>
    /// Main method. Expects the input PDF path and, optionally, the output
    /// text path; when the output path is omitted, the input file name with a
    /// ".txt" extension (without its directory part) is used.
    /// </summary>
    /// <param name="args">Command-line arguments: <c>infile.pdf [outfile.txt]</c>.</param>
    static void Main(string[] args)
    {
        if (args.Length < 1 || args.Length > 2)
        {
            Console.WriteLine("USAGE: ParsePDF infile.pdf <outfile.txt>");
            return;
        }

        string pdfPath = args[0];
        string textPath = args.Length == 2
            ? args[1]
            : Path.GetFileNameWithoutExtension(pdfPath) + ".txt";

        try
        {
            // Stopwatch measures elapsed time independently of wall-clock adjustments.
            Stopwatch timer = Stopwatch.StartNew();

            ParsingPDF example = new ParsingPDF();
            example.parsePdf(pdfPath, textPath);

            timer.Stop();
            Console.WriteLine("Parsing completed in {0:0.00} seconds.", timer.Elapsed.TotalSeconds);
        }
        catch (Exception ex)
        {
            Console.WriteLine("ERROR: " + ex.Message);

            // Report the failure to the calling process as well as on the console.
            Environment.ExitCode = 1;
        }
    } // Main

    /// <summary>
    /// Render listener that writes text-related events to a writer, wrapping
    /// text blocks and text chunks in angle brackets. It is not referenced by
    /// <see cref="parsePdf"/> in the code shown here.
    /// </summary>
    public class MyTextRenderListener : IRenderListener
    {
        /// <summary>The writer to which the information will be written.</summary>
        protected StreamWriter output;

        /// <summary>
        /// Creates a render listener that will look for text.
        /// </summary>
        /// <param name="output">The writer that receives the output; it is not closed by this class.</param>
        public MyTextRenderListener(StreamWriter output)
        {
            this.output = output;
        }

        /// <summary>Writes the opening bracket of a text block.</summary>
        public void BeginTextBlock()
        {
            output.Write("<");
        }

        /// <summary>Writes the closing bracket of a text block and ends the line.</summary>
        public void EndTextBlock()
        {
            output.WriteLine(">");
        }

        /// <summary>Images are ignored.</summary>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
        }

        /// <summary>Writes the text of the rendered chunk enclosed in angle brackets.</summary>
        public void RenderText(TextRenderInfo renderInfo)
        {
            output.Write("<");
            output.Write(renderInfo.GetText());
            output.Write(">");
        }
    } // class MyTextRenderListener
} // class ParsingPDF