Skip to content

Commit

Permalink
Closes #1462: Run experiments with Grobid as a potential Cermine repl…
Browse files Browse the repository at this point in the history
…acement

WIP: first version of the Grobid wrapper. Not working at the moment due to a Java class-version mismatch, which requires the whole IIS to be compiled with Java 11 or newer.
  • Loading branch information
marekhorst committed Aug 9, 2024
1 parent c04b086 commit 9ed1e44
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 0 deletions.
5 changes: 5 additions & 0 deletions iis-wf/iis-wf-metadataextraction/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@
<artifactId>cermine-impl</artifactId>
</dependency>

<dependency>
<groupId>org.grobid</groupId>
<artifactId>grobid-core</artifactId>
</dependency>

<dependency>
<groupId>org.jdom</groupId>
<artifactId>jdom</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package eu.dnetlib.iis.wf.metadataextraction;

import java.io.File;
import java.util.Collections;

import org.grobid.core.*;
import org.grobid.core.data.*;
import org.grobid.core.factory.*;
import org.grobid.core.main.GrobidHomeFinder;
import org.grobid.core.main.LibraryLoader;
import org.grobid.core.utilities.*;
import org.grobid.core.engines.Engine;
import org.grobid.core.engines.config.GrobidAnalysisConfig;

/**
 * Simple content extractor relying on Grobid library.
 *
 * <p>Experimental command-line wrapper: it points Grobid at a local grobid-home
 * directory, then prints the header metadata (title, authors, abstract) and the
 * output of {@code Engine.fullTextToTEI} for a single PDF file to standard output.
 *
 * <p>Usage: {@code GrobidBasedContentExtractor [pdfLocation [grobidHome]]}. Both
 * arguments are optional and fall back to the built-in default locations.
 *
 * @author mhorst
 */
public class GrobidBasedContentExtractor {

    /** PDF processed when no location is given on the command line. */
    private static final String DEFAULT_FILE_LOCATION = "/home/mhorst/Downloads/HPM-35-1009.pdf";

    /** Location of the grobid-home directory; may be overridden by the second program argument. */
    private static String grobidHome = "/home/mhorst/grobid-home";

    /**
     * Configures Grobid to use the {@code grobidHome} directory and loads the
     * native libraries required by Grobid.
     *
     * <p>Must be invoked before an {@link Engine} is created, otherwise the
     * configured {@code grobidHome} location is never taken into account.
     */
    public static void initializeGrobid() {
        GrobidHomeFinder grobidHomeFinder = new GrobidHomeFinder(Collections.singletonList(grobidHome));
        GrobidProperties.getInstance(grobidHomeFinder);

        // Load native libraries required for Grobid
        LibraryLoader.load();
    }

    /**
     * Runs header and full-text extraction on a single PDF file and prints the results.
     *
     * @param args optional arguments: {@code args[0]} is the PDF file location,
     *             {@code args[1]} is the grobid-home directory; defaults are used when absent
     */
    public static void main(String[] args) {
        String fileLocation = DEFAULT_FILE_LOCATION;
        if (args != null && args.length > 0) {
            fileLocation = args[0];
        }
        if (args != null && args.length > 1) {
            grobidHome = args[1];
        }

        // Grobid has to be told where its home directory is before any engine is created.
        initializeGrobid();

        // A single engine is shared by both extraction steps instead of creating one per step.
        Engine engine = GrobidFactory.getInstance().createEngine();

        extractMeta(engine, fileLocation);
        extractText(engine, fileLocation);
    }

    /**
     * Extracts header metadata from the given PDF and prints title, authors and abstract.
     * Failures are reported on standard error and do not stop the program.
     *
     * @param engine Grobid engine used for processing
     * @param fileLocation location of the PDF file to be processed
     */
    private static void extractMeta(Engine engine, String fileLocation) {
        // 0 - no consolidation
        int consolidate = 0;
        try {
            BiblioItem result = new BiblioItem();
            engine.processHeader(fileLocation, consolidate, result);
            System.out.println("Title: " + result.getTitle());
            System.out.println("Authors: " + result.getAuthors());
            System.out.println("Abstract: " + result.getAbstract());
        } catch (Exception e) {
            System.err.println("Header extraction failed for: " + fileLocation);
            e.printStackTrace();
        }
    }

    /**
     * Runs full-text processing on the given PDF and prints the returned TEI string.
     * Failures are reported on standard error and do not stop the program.
     *
     * @param engine Grobid engine used for processing
     * @param fileLocation location of the PDF file to be processed
     */
    private static void extractText(Engine engine, String fileLocation) {
        try {
            // header consolidation set to 0, the same value as used for header extraction
            GrobidAnalysisConfig config = GrobidAnalysisConfig.builder().consolidateHeader(0).build();
            File file = new File(fileLocation);
            String fullText = engine.fullTextToTEI(file, config);
            System.out.println("Extracted Text: " + fullText);
        } catch (Exception e) {
            System.err.println("Full text extraction failed for: " + fileLocation);
            e.printStackTrace();
        }
    }

}
11 changes: 11 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,11 @@
<enabled>false</enabled>
</snapshots>
</repository>
<repository>
<id>grobid</id>
<name>GROBID DIY repo</name>
<url>https://grobid.s3.eu-west-1.amazonaws.com/repo/</url>
</repository>
</repositories>

<dependencies>
Expand Down Expand Up @@ -728,6 +733,12 @@
<artifactId>simpleclient_pushgateway</artifactId>
<version>0.8.1</version>
</dependency>

<dependency>
<groupId>org.grobid</groupId>
<artifactId>grobid-core</artifactId>
<version>0.8.0</version>
</dependency>

<dependency>
<groupId>org.junit</groupId>
Expand Down

0 comments on commit 9ed1e44

Please sign in to comment.