From afb36656f930daf68be6a2f0fdb72f012fbdbe4e Mon Sep 17 00:00:00 2001 From: Svjatoslav Agejenko Date: Wed, 10 Jun 2015 23:12:36 +0300 Subject: [PATCH] added OCR capability --- .../meviz/encoder/FormatsRegistry.java | 4 ++ .../meviz/encoder/converters/Ocr.java | 48 +++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 src/main/java/eu/svjatoslav/meviz/encoder/converters/Ocr.java diff --git a/src/main/java/eu/svjatoslav/meviz/encoder/FormatsRegistry.java b/src/main/java/eu/svjatoslav/meviz/encoder/FormatsRegistry.java index d023f02..6b8d129 100755 --- a/src/main/java/eu/svjatoslav/meviz/encoder/FormatsRegistry.java +++ b/src/main/java/eu/svjatoslav/meviz/encoder/FormatsRegistry.java @@ -19,6 +19,7 @@ import eu.svjatoslav.meviz.encoder.converters.Convert; import eu.svjatoslav.meviz.encoder.converters.Ffmpeg2theora; import eu.svjatoslav.meviz.encoder.converters.Flac; import eu.svjatoslav.meviz.encoder.converters.Midi2Wav; +import eu.svjatoslav.meviz.encoder.converters.Ocr; import eu.svjatoslav.meviz.encoder.converters.Ogg2Wav; public class FormatsRegistry { @@ -33,6 +34,9 @@ public class FormatsRegistry { // image conversion registerEncoder(new Convert()); + // image to text (OCR) + registerEncoder(new Ocr()); + // audio conversion registerEncoder(new Ogg2Wav()); registerEncoder(new Flac()); diff --git a/src/main/java/eu/svjatoslav/meviz/encoder/converters/Ocr.java b/src/main/java/eu/svjatoslav/meviz/encoder/converters/Ocr.java new file mode 100644 index 0000000..0c20018 --- /dev/null +++ b/src/main/java/eu/svjatoslav/meviz/encoder/converters/Ocr.java @@ -0,0 +1,48 @@ +/* + * Meviz - Various tools collection to work with multimedia. + * Copyright (C) 2012, Svjatoslav Agejenko, svjatoslav@svjatoslav.eu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. 
+ */
+
+package eu.svjatoslav.meviz.encoder.converters;
+
+import java.io.File;
+import java.util.List;
+
+import eu.svjatoslav.meviz.encoder.EncodingOptions;
+
+/**
+ * Converter that turns an image file into a text file by building a command
+ * line for the external "tesseract" program.
+ */
+public class Ocr extends AbstractConverter {
+
+    /** Suffix that tesseract appends to its output base name on its own. */
+    private static final String TXT_SUFFIX = ".txt";
+
+    /**
+     * Builds the tesseract command line for a single file.
+     *
+     * @param inputFile
+     *            image to read; its absolute path becomes the first argument
+     * @param targetFile
+     *            text file to produce; its absolute path, minus a trailing
+     *            ".txt", becomes the second argument
+     * @param options
+     *            not used by this converter
+     * @param targetFormat
+     *            not used by this converter
+     * @return command of the form {@code tesseract "INPUT" "OUTPUT_BASE"}
+     */
+    @Override
+    public String getCommand(final File inputFile, final File targetFile,
+            final EncodingOptions options, String targetFormat) {
+
+        // NOTE(review): both paths are only wrapped in double quotes, so a
+        // path that itself contains a double quote would break the command.
+        // How the returned string gets executed is not visible here --
+        // confirm before adding any escaping.
+        return "tesseract \"" + inputFile.getAbsolutePath() + "\" \""
+                + stripTxtSuffix(targetFile.getAbsolutePath()) + "\"";
+    }
+
+    /**
+     * Removes a trailing ".txt" (compared case-insensitively) from the given
+     * path. Tesseract always appends that suffix to the output base itself,
+     * so passing it along would yield "name.txt.txt".
+     *
+     * A path that does not end with the suffix -- including one shorter than
+     * the suffix -- is returned unchanged instead of being truncated blindly.
+     */
+    private static String stripTxtSuffix(final String path) {
+        final int cut = path.length() - TXT_SUFFIX.length();
+
+        // regionMatches() answers false for a negative offset, so paths
+        // shorter than the suffix never reach substring().
+        if (path.regionMatches(true, cut, TXT_SUFFIX, 0, TXT_SUFFIX.length())) {
+            return path.substring(0, cut);
+        }
+        return path;
+    }
+
+    /**
+     * @return the accepted input extensions: tif, tiff, png, jpg and jpeg,
+     *         wrapped by the toList helper (not defined in this class)
+     */
+    @Override
+    public List getSourceFileExtensions() {
+        return toList("tif", "tiff", "png", "jpg", "jpeg");
+    }
+
+    /**
+     * @return the single produced extension, txt, wrapped by the toList
+     *         helper (not defined in this class)
+     */
+    @Override
+    public List getTargetFileExtensions() {
+        return toList("txt");
+    }
+
+    /**
+     * @return always {@code false}
+     */
+    @Override
+    public boolean isTerminalMandatory() {
+        // NOTE(review): presumably means the command needs no interactive
+        // terminal -- confirm against AbstractConverter.
+        return false;
+    }
+
+}
-- 
2.20.1