Picon

How to index only the pdf content/text

I searched for a way to index only the content/text part of a PDF (without all the other fields Tika creates) and I found the “solution” with the "uprefix" = ignored_ and <dynamicField name="ignored_*" type="ignored" multiValued="true" indexed="false" stored="false" />.

 

The problem is that uprefix works on fields that are not specified in the schema. In my schema I specified two fields (id and rmDocumentTitle) and these two fields are added to the content too (which I want to avoid).

 

How can I exclude these two fields from being added to the fullText?

 

Here are my config files:

 

schema.xml

<?xml version="1.0" encoding="UTF-8" ?>

<schema name="simple" version="1.1">

                <types>

                               <fieldtype name="string" class="solr.StrField" postingsFormat="SimpleText" />

                               <fieldtype name="ignored" class="solr.TextField" />

                               <fieldtype name="text" class="solr.TextField" postingsFormat="SimpleText">

                                               <analyzer type="index">

                                                               <tokenizer class="solr.StandardTokenizerFactory"/>

                                                               <!--<filter class="solr.ASCIIFoldingFilterFactory"/>--> <!--Converts alphabetic, numeric, and symbolic Unicode characters which are not in the first 127 ASCII characters into their ASCII equivalents, if one exists. -->

                                                               <filter class="solr.LowerCaseFilterFactory" /> <!--Lowercases the letters in each token. Leaves non-letter tokens alone.-->

                                                               <filter class="solr.TrimFilterFactory"/> <!--Trims whitespace at either end of a token. -->

                                                               <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/> <!--Discards common words.  -->

                                                               <filter class="solr.PorterStemFilterFactory"/>

                                                               <!--<filter class="solr.SnowballPorterFilterFactory" language="German2" /> -->

                                                               <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>

                                               </analyzer>

                                               <analyzer type="query">

                                                               <tokenizer class="solr.StandardTokenizerFactory"/>

                                                               <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>

                                                               <filter class="solr.LowerCaseFilterFactory" />

                                                               <filter class="solr.TrimFilterFactory"/>

                                                               <filter class="solr.PorterStemFilterFactory"/>

                                                               <!--<filter class="solr.SnowballPorterFilterFactory" language="German2" /> -->

                                                               <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>

                                               </analyzer>

                               </fieldtype>

                </types>

 

                <fields>

                               <field name="signatureField" type="string" indexed="true" stored="true" multiValued="false" />

                               <dynamicField name="ignored_*" type="ignored" multiValued="true" indexed="false" stored="false" />

                               <field name="id" type="string" indexed="true" stored="true" multiValued="false" />

                               <field name="rmDocumentTitle" type="string" indexed="true" stored="true" multiValued="true"/>

                               <field name="fullText" indexed="true" type="text" multiValued="true" />

                </fields>

 

                <defaultSearchField>fullText</defaultSearchField>

 

                <solrQueryParser defaultOperator="OR" />

                <uniqueKey>id</uniqueKey>

</schema>

 

 

solrconfig.xml

<?xml version="1.0" encoding="UTF-8" ?>

<config>

                …

                <requestHandler name="/update/extract" class="solr.extraction.ExtractingRequestHandler">

                               <lst name="defaults">

                                               <str name="captureAttr">true</str>

                                               <str name="lowernames">false</str>

                                               <str name="overwrite">false</str>

                                               <str name="captureAttr">true</str>

                                               <str name="literalsOverride">true</str>

                                               <str name="uprefix">ignored_</str>

                                               <str name="fmap.a">link</str>

                                               <str name="fmap.content">fullText</str>

                                               <!-- the configuration here could be useful for tests -->

                                               <str name="update.chain">deduplication</str>

                               </lst>

                </requestHandler>

 

                <updateRequestProcessorChain name="deduplication">

                               <processor

                                               class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">

                                               <bool name="overwriteDupes">false</bool>

                                               <str name="signatureField">signatureField</str>

                                               <bool name="enabled">true</bool>

                                               <str name="fields">content</str>

                                               <str name="minTokenLen">10</str>

                                               <str name="quantRate">.2</str>

                                               <str name="signatureClass">solr.update.processor.TextProfileSignature</str>

                               </processor>

                               <processor class="solr.LogUpdateProcessorFactory" />

                               <processor class="solr.RunUpdateProcessorFactory" />

                </updateRequestProcessorChain>

 

                <requestHandler name="/admin/"

                               class="org.apache.solr.handler.admin.AdminHandlers" />

               

                <lockType>none</lockType>

               

                <admin>

                               <defaultQuery>*:*</defaultQuery>

                </admin>

</config>

 

 

Thank you for any help.

Francesco

 

Matthew Snape | 19 Mar 17:17 2014

problem with duplicate json output

Hi,

 

I am attempting to extract metadata from a pdf into JSON with the following command….

 

java -jar tika-app-1.5.jar -j example.pdf

 

This appears to give the output twice.  For example, the following PDF gives the output below, which breaks my JSON parser.  Am I doing something wrong?

 

Thanks.

 

Example PDF

 

http://www.dadsgarage.com/~/media/Files/example.ashx

 

Output

 

{ "Author":null,

"Content-Length":194007,

"Content-Type":"application/pdf",

"Keywords":null,

"cp:subject":null,

"creator":null,

"dc:creator":null,

"dc:subject":null,

"dc:title":null,

"meta:author":null,

"meta:keyword":null,

"producer":"dvips + GNU Ghostscript 7.05",

"resourceName":"example.pdf",

"subject":null,

"title":null,

"xmp:CreatorTool":"LaTeX with hyperref package",

"xmpTPg:NPages":10 }{ "Author":null,

"Content-Length":194007,

"Content-Type":"application/pdf",

"Keywords":null,

"cp:subject":null,

"creator":null,

"dc:creator":null,

"dc:subject":null,

"dc:title":null,

"meta:author":null,

"meta:keyword":null,

"producer":"dvips + GNU Ghostscript 7.05",

"resourceName":"example.pdf",

"subject":null,

"title":null,

"xmp:CreatorTool":"LaTeX with hyperref package",

"xmpTPg:NPages":10 }

This e-mail message and any attached file is the property of the sender and is sent in confidence to the addressee only.

Internet communications are not secure and RPS is not responsible for their abuse by third parties, any alteration or corruption in transmission or for any loss or damage caused by a virus or by any other means.

RPS Planning and Development Limited, company number: 02947164 (England). Registered office: 20 Western Avenue Milton Park Abingdon Oxfordshire OX14 4SH.

RPS Group Plc web link: http://www.rpsgroup.com

Picon

Indexing only “readable/parsable” text from pdf

I have to index a list of PDFs and for some of them there is no problem, but for others, when I look at the indexed content, I only see a lot of diamonds with a question mark in them.

 

I think the problem is the font used for the document or that the content is "encapsulated" into a picture.

 

Is there a way to tell tika to extract only the "readable/parsable" text of a pdf?

 

When I query all the documents (with my java application) this is an ex. of what I see in the logfile for the content of the problematic files:

 

    DEBUG org.apache.http.wire -  << " [\n]">

    DEBUG org.apache.http.wire -  << "  [0xe8]?[0x1]d41d8cd98f00b204e9800998ecf8427e[0xb][0xa4][0xe5][0x81](Diverses[0xe6]=aabhpdtyan3vfsujquccemebqr4m3[0xe7][0x81]?[0xc1][0x4] [\n]">

    DEBUG org.apache.http.wire -  << " [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << " E-Mail zur Archivierung [\n]">

    DEBUG org.apache.http.wire -  << " [\n]">

    DEBUG org.apache.http.wire -  << "    [\n]">

    DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][\n]">

    DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][\n]">

    DEBUG org.apache.http.wire -  << " [\n]">

    DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">

    DEBUG org.apache.http.wire -  << " [\n]">

    DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">

    DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">

    DEBUG org.apache.http.wire -  << " [\n]">

    DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">

    DEBUG org.apache.http.wire -  << " [\n]">

    DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">

    DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][\n]">

    DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">

    DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">

    DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][\n]">

    DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">

    DEBUG org.apache.http.wire -  << " [\n]">

    DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">

    DEBUG org.apache.http.wire -  << " [\n]">

    DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd] [\n]">

    DEBUG org.apache.http.wire -  << "  [\n]">

    DEBUG org.apache.http.wire -  << " [\n]">

    DEBUG org.apache.http.wire -  << " [0x9] data1.pdf [\n]">

 

 

 

Another problem is that for all the files (also the "good ones") at the beginning of the content field there is a long list of `\n`, as you can also see above. How can I avoid this?

 

 

Here is my schema.xml:

 

    <?xml version="1.0" encoding="UTF-8" ?>

    <schema name="simple" version="1.1">

                <types>

                               <fieldtype name="string" class="solr.StrField" postingsFormat="SimpleText" />

                               <fieldtype name="ignored" class="solr.TextField" />

                                <fieldtype name="text" class="solr.TextField" postingsFormat="SimpleText">

                                               <analyzer>

                                                               <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\n" replacement=""/>

                                                               <tokenizer class="solr.StandardTokenizerFactory"/>

                                                               <filter class="solr.LowerCaseFilterFactory" /> <!--Lowercases the letters in each token. Leaves non-letter tokens alone.-->

                                                               <filter class="solr.ClassicFilterFactory" /> <!--Removes dots from acronyms and 's from the end of tokens. Works only on typed tokens produced by ClassicTokenizer or equivalent.-->

                                                               <filter class="solr.TrimFilterFactory"/> <!--Trims whitespace at either end of a token. -->

                                                               <filter class="solr.StopFilterFactory" ignoreCase="true"/> <!--Discards common words.  -->

                                                               <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>

                                               </analyzer>

                               </fieldtype>

                </types>

   

                <fields>

                               <field name="signatureField" type="string" indexed="true" stored="true" multiValued="false" />

                               <dynamicField name="ignored_*" type="ignored" multiValued="true" indexed="false" stored="false" />

                               <field name="id" type="string" indexed="true" stored="true" multiValued="false" />

                               <field name="rmDocumentTitle" type="string" indexed="true" stored="true" multiValued="true"/>

                               <field name="fullText" indexed="true" type="text" multiValued="true" />

                </fields>

   

                <defaultSearchField>fullText</defaultSearchField>

    

                <solrQueryParser defaultOperator="OR" />

                <uniqueKey>id</uniqueKey>

    </schema>

 

and my solrconfig.xml:

 

    <?xml version="1.0" encoding="UTF-8" ?>

    <config>

                <luceneMatchVersion>LUCENE_45</luceneMatchVersion>

                <directoryFactory name='DirectoryFactory' class='solr.MMapDirectoryFactory' />

   

                <codecFactory name="CodecFactory" class="solr.SchemaCodecFactory" />

   

                <lib dir='${solr.core.instanceDir}\lib' />

                <lib dir="${solr.core.instanceDir}\dist\" regex="solr-cell-\d.*\.jar" />

                <lib dir="${solr.core.instanceDir}\contrib\extraction\lib" regex=".*\.jar" />

   

                <requestHandler name="standard" class="solr.StandardRequestHandler" default="true" />

   

                <requestHandler name="/update" class="solr.UpdateRequestHandler">

                               <lst name="defaults">

                                               <str name="update.chain">deduplication</str>

                               </lst>

                </requestHandler>

   

                <requestHandler name="/update/extract" class="solr.extraction.ExtractingRequestHandler">

                               <lst name="defaults">

                                               <str name="captureAttr">true</str>

                                               <str name="lowernames">false</str>

                                               <str name="overwrite">false</str>

                                               <str name="captureAttr">true</str>

                                               <str name="literalsOverride">true</str>

                                               <str name="uprefix">ignored_</str>

                                               <str name="fmap.a">link</str>

                                               <str name="fmap.content">fullText</str>

                                               <!-- the configuration here could be useful for tests -->

                                               <str name="update.chain">deduplication</str>

                               </lst>

                </requestHandler>

   

                <updateRequestProcessorChain name="deduplication">

                               <processor

                                               class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">

                                               <bool name="overwriteDupes">false</bool>

                                               <str name="signatureField">signatureField</str>

                                               <bool name="enabled">true</bool>

                                               <str name="fields">content</str>

                                               <str name="minTokenLen">10</str>

                                               <str name="quantRate">.2</str>

                                               <str name="signatureClass">solr.update.processor.TextProfileSignature</str>

                               </processor>

                               <processor class="solr.LogUpdateProcessorFactory" />

                               <processor class="solr.RunUpdateProcessorFactory" />

                </updateRequestProcessorChain>

   

                <requestHandler name="/admin/"

                               class="org.apache.solr.handler.admin.AdminHandlers" />

               

                <lockType>none</lockType>

               

                <admin>

                               <defaultQuery>*:*</defaultQuery>

                </admin>

   

    </config>

Grant Ingersoll | 13 Mar 19:28 2014
Picon

Parsers, DefaultConfig and such

Myself and a colleague were parsing the Enron dataset the other day and
noticed that a number of emails that had message bodies in them were not
getting extracted.

In particular, when running our Tika parsing code in Hadoop distributed
mode, the body was going missing.  If I ran the exact same code in my
IDE in Hadoop local mode (i.e. no cluster), the message body gets
extracted fine.

To isolate things down, we tried with the testLotusEml.eml file in
Tika's test document suite (many of the Enron emails are Lotus) and
noticed the same thing.  Digging in further, I thought the issue might
be something in the RFC822Parser, since this is the MIME type of the
document.  (In particular, I thought it would be a threading issue)

Turns out, however, the problem seems to be in my understanding of how
TikaConfig.getDefaultConfig().getParser works (or doesn't work).
Namely, if you run the Test below (I added it to RFC822ParserTest
locally), the first two checkParser methods pass just fine, the third
one fails.

So, I guess my questions are:
- what's different between how I use getDefaultConfig in local mode vs.
Hadoop mode?  I haven't customized the config at all in either case and
I am not aware of any SPIs registered.  (I've also reproduced the
problem in non-dev environments -- i.e. machines only doing this
workload w/ a clean OS)
- what's different in this test which is being run in the Tika
development environment and presumably has the same core configuration?

(note to Julien Nioche, if you are reading this: this problem exists in
Behemoth TikaProcessor or at least it did in the snapshot of the version
I have)

  @Test
 public void testLotus() throws Exception {
   checkParser(new RFC822Parser());
   checkParser(new AutoDetectParser());
   checkParser(TikaConfig.getDefaultConfig().getParser());
 }

 private void checkParser(Parser parser) {
   Metadata metadata = new Metadata();
   InputStream stream = getStream("test-documents/testLotusEml.eml");
   ContentHandler handler = new BodyContentHandler();

   try {
     parser.parse(stream, handler, metadata, new ParseContext());
     String bodyText = handler.toString();
     assertTrue(bodyText.contains("Message body"));
   } catch (Exception e) {
     fail("Exception thrown: " + e.getMessage());
   }
 }

Thanks,
Grant

--------------------------------------------
Grant Ingersoll | @gsingers
http://www.lucidworks.com

Mirko Sertic | 11 Mar 13:39 2014
Picon

Performance problems with Tika 1.5 and Microsoft Office docx files

Hi there

I am encountering some performance problems while extracting content from Microsoft Office docx files
with Tika 1.5.

It seems as if Tika needs about 1.3 seconds to extract metadata and content per file. I am using the
Tika.parseToString() method. After some digging around with JProfiler, I discovered that Tika uses the
org.openxmlformat.schema XMLBean classes a lot. The DocumentDocument class consumes a lot of CPU time
while parsing content.

Now, how can I speed up metadata and content extraction?

a) Is the Tika class stateful? Do I have to create a new instance for every document, or can I reuse it?
b) Are the parsers stateful? Do I have to create a new parser for every document, or can I reuse it?
c) How can I tune the org.openxmlformat.schema classes?
d) What are the best practices to run Tika in a multithreaded environment?

Thanks in advance
Mirko

Picon

Many PDFs indexed but only one returned in the Solr-UI

I followed the example here (http://searchhub.org/2012/02/14/indexing-with-solrj/) for indexing all the PDFs in a directory. The process seems to work well, but at the end, when I go into the Solr-UI and click on "Execute query" (with q=*:*), I get only one entry.

Am I missing something in my code?

    ...

    String[] files = documentDir.list();

 

    if (files != null)

    {

      for (String document : files)

      {      

        ContentHandler textHandler = new BodyContentHandler();

        Metadata metadata = new Metadata();

        ParseContext context = new ParseContext();

        AutoDetectParser autoDetectParser = new AutoDetectParser();

 

        InputStream inputStream = null;

 

        try

        {

          inputStream = new FileInputStream(new File(documentDir, document));

 

          autoDetectParser.parse(inputStream, textHandler, metadata, context);

 

          SolrInputDocument doc = new SolrInputDocument();

          doc.addField("id", document);

 

          String content = textHandler.toString();

 

          if (content != null)

          {

            doc.addField("fullText", content);

          }

 

          UpdateResponse resp = server.add(doc, 1);

 

          server.commit(true, true, true);

 

          if (resp.getStatus() != 0)

          {

            throw new IDSystemException(LOG, "Document could not be indexed. Status returned: " + resp.getStatus());

          }

        }

        catch (FileNotFoundException fnfe)

        {

          throw new IDSystemException(LOG, fnfe.getMessage(), fnfe);

        }

        catch (IOException ioe)

        {

          throw new IDSystemException(LOG, ioe.getMessage(), ioe);

        }

        catch (SAXException se)

        {

          throw new IDSystemException(LOG, se.getMessage(), se);

        }

        catch (TikaException te)

        {

          throw new IDSystemException(LOG, te.getMessage(), te);

        }

        catch (SolrServerException sse)

        {

          throw new IDSystemException(LOG, sse.getMessage(), sse);

        }

        finally

        {

          if (inputStream != null)

          {

            try

            {

              inputStream.close();

            }

            catch (IOException ioe)

           {

              throw new IDSystemException(LOG, ioe.getMessage(), ioe);

            }

          }

        }

       ...

 

Thank you for any hint.

 

Francesco

Benson Margulies | 8 Mar 19:54 2014
Picon

HWP?

Given a large pile of HWP files,

find . -name "*.hwp" -exec java -jar ~/Downloads/tika-app-1.5.jar -v -t {} \;

does not result in any text.

Is there a detector and not a parser?

Milos | 7 Mar 23:39 2014

how to limit the number of chars returned from tika server

Hello, I am using tika server 1.5 (JAX-RS). Is there any way to limit the 
number of chars returned by tika server? And how do I specify the default format of 
the returned text to be plain text and not HTML output?

Picon

Conflict mybatis and tika-parsers on XPathFactory

To solve my problem with the file lock (see my previous mail), I tried another way.

I tried with tika-parsers:

 

        ContentHandler textHandler = new BodyContentHandler();

    Metadata metadata = new Metadata();

    ParseContext context = new ParseContext();

    AutoDetectParser autoDetectParser = new AutoDetectParser();

 

    InputStream inputStream = null;

 

    try

    {

      inputStream = new FileInputStream(document);

 

      autoDetectParser.parse(inputStream, textHandler, metadata, context);

 

      SolrInputDocument doc = new SolrInputDocument();

      doc.addField("id", document.getName());

 

      String content = metadata.get("Content");

 

      if (content != null)

      {

        doc.addField("content", content);

      }

 

      UpdateResponse resp = server.add(doc);

 

      if (resp.getStatus() != 0)

      {

        throw new IDSystemException(LOG, "Document could not be indexed. Status returned: " + resp.getStatus());

      }

    }

 

but as soon as I want to deploy my application I get the following error:

 

Task 7 initiated: [Deployer:149026]deploy application fts on AdminServer.

dumping Exception stack

Task 7 failed: [Deployer:149026]deploy application fts on AdminServer.

Target state: deploy failed on Server AdminServer

java.lang.RuntimeException: XPathFactory#newInstance() failed to create an XPathFactory for the default object model: http://java.sun.com/jaxp/xpath/dom with the XPathFactoryConfigurationException: javax.xml.xpath.XPathFactoryConfigurationException: No XPathFctory implementation found for the object model: http://java.sun.com/jaxp/xpath/dom

    at javax.xml.xpath.XPathFactory.newInstance(Unknown Source)

    at org.apache.ibatis.parsing.XPathParser.commonConstructor(XPathParser.java:261)

    at org.apache.ibatis.parsing.XPathParser.<init>(XPathParser.java:121)

    at org.apache.ibatis.builder.xml.XMLConfigBuilder.<init>(XMLConfigBuilder.java:72)

    at org.mybatis.spring.SqlSessionFactoryBean.buildSqlSessionFactory(SqlSessionFactoryBean.java:354)

    at org.mybatis.spring.SqlSessionFactoryBean.afterPropertiesSet(SqlSessionFactoryBean.java:336)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.invokeInitMethods(AbstractAutowireCapableBeanFactory.java:1571)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.initializeBean(AbstractAutowireCapableBeanFactory.java:1509)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.doCreateBean(AbstractAutowireCapableBeanFactory.java:521)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.createBean(AbstractAutowireCapableBeanFactory.java:458)

    at org.springframework.beans.factory.support.AbstractBeanFactory$1.getObject(AbstractBeanFactory.java:296)

    at org.springframework.beans.factory.support.DefaultSingletonBeanRegistry.getSingleton(DefaultSingletonBeanRegistry.java:223)

    at org.springframework.beans.factory.support.AbstractBeanFactory.doGetBean(AbstractBeanFactory.java:293)

    at org.springframework.beans.factory.support.AbstractBeanFactory.getBean(AbstractBeanFactory.java:194)

    at org.springframework.beans.factory.support.DefaultListableBeanFactory.findAutowireCandidates(DefaultListableBeanFactory.java:912)

    at org.springframework.beans.factory.support.DefaultListableBeanFactory.doResolveDependency(DefaultListableBeanFactory.java:855)

    at org.springframework.beans.factory.support.DefaultListableBeanFactory.resolveDependency(DefaultListableBeanFactory.java:770)

    at org.springframework.beans.factory.annotation.AutowiredAnnotationBeanPostProcessor$AutowiredMethodElement.inject(AutowiredAnnotationBeanPostProcessor.java:561)

    at org.springframework.beans.factory.annotation.InjectionMetadata.inject(InjectionMetadata.java:87)

    at org.springframework.beans.factory.annotation.AutowiredAnnotationBeanPostProcessor.postProcessPropertyValues(AutowiredAnnotationBeanPostProcessor.java:286)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.populateBean(AbstractAutowireCapableBeanFactory.java:1146)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.doCreateBean(AbstractAutowireCapableBeanFactory.java:519)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.createBean(AbstractAutowireCapableBeanFactory.java:458)

    at org.springframework.beans.factory.support.AbstractBeanFactory$1.getObject(AbstractBeanFactory.java:296)

    at org.springframework.beans.factory.support.DefaultSingletonBeanRegistry.getSingleton(DefaultSingletonBeanRegistry.java:223)

    at org.springframework.beans.factory.support.AbstractBeanFactory.doGetBean(AbstractBeanFactory.java:293)

    at org.springframework.beans.factory.support.AbstractBeanFactory.getBean(AbstractBeanFactory.java:194)

    at org.springframework.beans.factory.support.DefaultListableBeanFactory.findAutowireCandidates(DefaultListableBeanFactory.java:912)

    at org.springframework.beans.factory.support.DefaultListableBeanFactory.doResolveDependency(DefaultListableBeanFactory.java:855)

    at org.springframework.beans.factory.support.DefaultListableBeanFactory.resolveDependency(DefaultListableBeanFactory.java:770)

    at org.springframework.beans.factory.annotation.AutowiredAnnotationBeanPostProcessor$AutowiredMethodElement.inject(AutowiredAnnotationBeanPostProcessor.java:561)

    at org.springframework.beans.factory.annotation.InjectionMetadata.inject(InjectionMetadata.java:87)

    at org.springframework.beans.factory.annotation.AutowiredAnnotationBeanPostProcessor.postProcessPropertyValues(AutowiredAnnotationBeanPostProcessor.java:286)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.populateBean(AbstractAutowireCapableBeanFactory.java:1146)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.doCreateBean(AbstractAutowireCapableBeanFactory.java:519)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.createBean(AbstractAutowireCapableBeanFactory.java:458)

    at org.springframework.beans.factory.support.AbstractBeanFactory$1.getObject(AbstractBeanFactory.java:296)

    at org.springframework.beans.factory.support.DefaultSingletonBeanRegistry.getSingleton(DefaultSingletonBeanRegistry.java:223)

    at org.springframework.beans.factory.support.AbstractBeanFactory.doGetBean(AbstractBeanFactory.java:293)

    at org.springframework.beans.factory.support.AbstractBeanFactory.getBean(AbstractBeanFactory.java:194)

    at org.springframework.beans.factory.support.BeanDefinitionValueResolver.resolveReference(BeanDefinitionValueResolver.java:323)

    at org.springframework.beans.factory.support.BeanDefinitionValueResolver.resolveValueIfNecessary(BeanDefinitionValueResolver.java:107)

    at org.springframework.beans.factory.support.BeanDefinitionValueResolver.resolveManagedList(BeanDefinitionValueResolver.java:353)

    at org.springframework.beans.factory.support.BeanDefinitionValueResolver.resolveValueIfNecessary(BeanDefinitionValueResolver.java:154)

    at org.springframework.beans.factory.support.ConstructorResolver.resolveConstructorArguments(ConstructorResolver.java:623)

    at org.springframework.beans.factory.support.ConstructorResolver.autowireConstructor(ConstructorResolver.java:148)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.autowireConstructor(AbstractAutowireCapableBeanFactory.java:1075)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.createBeanInstance(AbstractAutowireCapableBeanFactory.java:979)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.doCreateBean(AbstractAutowireCapableBeanFactory.java:487)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.createBean(AbstractAutowireCapableBeanFactory.java:458)

    at org.springframework.beans.factory.support.AbstractBeanFactory$1.getObject(AbstractBeanFactory.java:296)

    at org.springframework.beans.factory.support.DefaultSingletonBeanRegistry.getSingleton(DefaultSingletonBeanRegistry.java:223)

    at org.springframework.beans.factory.support.AbstractBeanFactory.doGetBean(AbstractBeanFactory.java:293)

    at org.springframework.beans.factory.support.AbstractBeanFactory.getBean(AbstractBeanFactory.java:194)

    at org.springframework.beans.factory.support.DefaultListableBeanFactory.findAutowireCandidates(DefaultListableBeanFactory.java:912)

    at org.springframework.beans.factory.support.DefaultListableBeanFactory.doResolveDependency(DefaultListableBeanFactory.java:855)

    at org.springframework.beans.factory.support.DefaultListableBeanFactory.resolveDependency(DefaultListableBeanFactory.java:770)

    at org.springframework.beans.factory.annotation.AutowiredAnnotationBeanPostProcessor$AutowiredMethodElement.inject(AutowiredAnnotationBeanPostProcessor.java:561)

    at org.springframework.beans.factory.annotation.InjectionMetadata.inject(InjectionMetadata.java:87)

    at org.springframework.beans.factory.annotation.AutowiredAnnotationBeanPostProcessor.postProcessPropertyValues(AutowiredAnnotationBeanPostProcessor.java:286)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.populateBean(AbstractAutowireCapableBeanFactory.java:1146)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.doCreateBean(AbstractAutowireCapableBeanFactory.java:519)

    at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.createBean(AbstractAutowireCapableBeanFactory.java:458)

    at org.springframework.beans.factory.support.AbstractBeanFactory$1.getObject(AbstractBeanFactory.java:296)

    at org.springframework.beans.factory.support.DefaultSingletonBeanRegistry.getSingleton(DefaultSingletonBeanRegistry.java:223)

    at org.springframework.beans.factory.support.AbstractBeanFactory.doGetBean(AbstractBeanFactory.java:293)

    at org.springframework.beans.factory.support.AbstractBeanFactory.getBean(AbstractBeanFactory.java:194)

    at org.springframework.beans.factory.support.DefaultListableBeanFactory.preInstantiateSingletons(DefaultListableBeanFactory.java:628)

    at org.springframework.context.support.AbstractApplicationContext.finishBeanFactoryInitialization(AbstractApplicationContext.java:932)

    at org.springframework.context.support.AbstractApplicationContext.refresh(AbstractApplicationContext.java:479)

    at org.springframework.web.context.ContextLoader.configureAndRefreshWebApplicationContext(ContextLoader.java:389)

    at org.springframework.web.context.ContextLoader.initWebApplicationContext(ContextLoader.java:294)

    at org.springframework.web.context.ContextLoaderListener.contextInitialized(ContextLoaderListener.java:112)

    at weblogic.servlet.internal.EventsManager$FireContextListenerAction.run(EventsManager.java:481)

    at weblogic.security.acl.internal.AuthenticatedSubject.doAs(AuthenticatedSubject.java:321)

    at weblogic.security.service.SecurityManager.runAs(SecurityManager.java:121)

    at weblogic.servlet.internal.EventsManager.notifyContextCreatedEvent(EventsManager.java:181)

    at weblogic.servlet.internal.WebAppServletContext.preloadResources(WebAppServletContext.java:1801)

    at weblogic.servlet.internal.WebAppServletContext.start(WebAppServletContext.java:3045)

    at weblogic.servlet.internal.WebAppModule.startContexts(WebAppModule.java:1397)

    at weblogic.servlet.internal.WebAppModule.start(WebAppModule.java:460)

    at weblogic.application.internal.flow.ModuleStateDriver$3.next(ModuleStateDriver.java:425)

    at weblogic.application.utils.StateMachineDriver.nextState(StateMachineDriver.java:83)

    at weblogic.application.internal.flow.ModuleStateDriver.start(ModuleStateDriver.java:119)

    at weblogic.application.internal.flow.ScopedModuleDriver.start(ScopedModuleDriver.java:200)

    at weblogic.application.internal.flow.ModuleListenerInvoker.start(ModuleListenerInvoker.java:247)

    at weblogic.application.internal.flow.ModuleStateDriver$3.next(ModuleStateDriver.java:425)

    at weblogic.application.utils.StateMachineDriver.nextState(StateMachineDriver.java:83)

    at weblogic.application.internal.flow.ModuleStateDriver.start(ModuleStateDriver.java:119)

    at weblogic.application.internal.flow.StartModulesFlow.activate(StartModulesFlow.java:27)

    at weblogic.application.internal.BaseDeployment$2.next(BaseDeployment.java:1267)

    at weblogic.application.utils.StateMachineDriver.nextState(StateMachineDriver.java:83)

    at weblogic.application.internal.BaseDeployment.activate(BaseDeployment.java:409)

    at weblogic.application.internal.SingleModuleDeployment.activate(SingleModuleDeployment.java:39)

    at weblogic.application.internal.DeploymentStateChecker.activate(DeploymentStateChecker.java:161)

    at weblogic.deploy.internal.targetserver.AppContainerInvoker.activate(AppContainerInvoker.java:79)

    at weblogic.deploy.internal.targetserver.operations.AbstractOperation.activate(AbstractOperation.java:569)

    at weblogic.deploy.internal.targetserver.operations.ActivateOperation.activateDeployment(ActivateOperation.java:150)

    at weblogic.deploy.internal.targetserver.operations.ActivateOperation.doCommit(ActivateOperation.java:116)

    at weblogic.deploy.internal.targetserver.operations.AbstractOperation.commit(AbstractOperation.java:323)

    at weblogic.deploy.internal.targetserver.DeploymentManager.handleDeploymentCommit(DeploymentManager.java:844)

    at weblogic.deploy.internal.targetserver.DeploymentManager.activateDeploymentList(DeploymentManager.java:1253)

    at weblogic.deploy.internal.targetserver.DeploymentManager.handleCommit(DeploymentManager.java:440)

    at weblogic.deploy.internal.targetserver.DeploymentServiceDispatcher.commit(DeploymentServiceDispatcher.java:163)

    at weblogic.deploy.service.internal.targetserver.DeploymentReceiverCallbackDeliverer.doCommitCallback(DeploymentReceiverCallbackDeliverer.java:181)

    at weblogic.deploy.service.internal.targetserver.DeploymentReceiverCallbackDeliverer.access$100(DeploymentReceiverCallbackDeliverer.java:12)

    at weblogic.deploy.service.internal.targetserver.DeploymentReceiverCallbackDeliverer$2.run(DeploymentReceiverCallbackDeliverer.java:67)

    at weblogic.work.SelfTuningWorkManagerImpl$WorkAdapterImpl.run(SelfTuningWorkManagerImpl.java:516)

    at weblogic.work.ExecuteThread.execute(ExecuteThread.java:201)

    at weblogic.work.ExecuteThread.run(ExecuteThread.java:173)

 

In my pom I have dependencies for mybatis 3.2.2, mybatis-spring 1.2.0 and (newly added)

<dependency>

  <groupId>org.apache.tika</groupId>

  <artifactId>tika-parsers</artifactId>

  <version>1.5</version>

</dependency>

How can I solve this problem?

I tried the latest version of mybatis as well, but that doesn't help either...

 

 

Greetings

Francesco

 

Picon

Files locked after indexing

Hi to all,

 

I’m pretty new to Solr and Tika, and I have a problem.

 

I have the following workflow in my (web)application:

  • download a pdf file from an archive

  • index the file

  • delete the file

 

 

My problem is that after indexing the file, it remains locked and the delete-part throws an exception.

 

Here is my code-snippet for indexing the file:

 

try

{

   ContentStreamUpdateRequest req = new ContentStreamUpdateRequest("/update/extract");

   req.addFile(file, type);

   req.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);

 

   NamedList<Object> result = server.request(req);

 

   Assert.assertEquals(0, ((NamedList<?>) result.get("responseHeader")).get("status"));

}

 

I also tried the “ContentStream” way but without success:

ContentStream contentStream = null;

        

    try

    {

      contentStream = new ContentStreamBase.FileStream(document);

     

      ContentStreamUpdateRequest req = new ContentStreamUpdateRequest(UPDATE_EXTRACT_REQUEST);

      req.addContentStream(contentStream);

      req.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);

 

      NamedList<Object> result = server.request(req);

 

      if (!((NamedList<?>) result.get("responseHeader")).get("status").equals(0))

      {

        throw new IDSystemException(LOG, "Document could not be indexed. Status returned: " +

                                         ((NamedList<?>) result.get("responseHeader")).get("status"));

      }

    }

   catch…

   finally

    {

      try

      {

        if(contentStream != null && contentStream.getStream() != null)

        {

          contentStream.getStream().close();

        }

      }

      catch (IOException ioe)

      {

        throw new IDSystemException(LOG, ioe.getMessage(), ioe);

      }

    }

 

 

Am I missing something?

 

Thank you

Francesco

 

Sudheshna Iyer | 25 Feb 22:43 2014
Picon

Extract metadata

Hello,

1. I have a few questions about the extraction of metadata, so I wanted to join 
the mailing list of the Tika user group. Can you please provide the email address 
for it? 

2. How do I extract the metadata from a file? For eg:  I need author 
information. So for different files, author information is coming from 
different fields like: 
Author , meta:author , citation_author

Which one should I take?  I also need to extract ~15 predefined metadata 
fields, such as publication year, DOI, etc., from the Metadata. 
What is the best way to extract these fields from the Metadata object? 
Metadata.names() contains elements like "citation_doi". 
Should I iterate through the metadata names and, for each name, do something like

if (name.contains("doi")) then DOI_CONST = metadata.get(name)

Is there any better way to extract the metadata?


Gmane