You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
tellico/src/translators/pdfimporter.cpp

282 lines
9.9 KiB

/***************************************************************************
copyright : (C) 2007 by Robby Stephenson
email : robby@periapsis.org
***************************************************************************/
/***************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of version 2 of the GNU General Public License as *
* published by the Free Software Foundation; *
* *
***************************************************************************/
#include "pdfimporter.h"
#include "tellicoimporter.h"
#include "xslthandler.h"
#include "../collections/bibtexcollection.h"
#include "../xmphandler.h"
#include "../filehandler.h"
#include "../imagefactory.h"
#include "../tellico_kernel.h"
#include "../fetch/fetchmanager.h"
#include "../fetch/crossreffetcher.h"
#include "../tellico_utils.h"
#include "../progressmanager.h"
#include "../core/netaccess.h"
#include "../tellico_debug.h"
#include <kstandarddirs.h>
#include <tdemessagebox.h>
#include <config.h>
#ifdef HAVE_POPPLER
#include <poppler-qt.h>
#endif
namespace {
  // Size argument handed to NetAccess::filePreview() when a preview image
  // is generated for an imported PDF (presumably pixels -- TODO confirm).
  // The anonymous namespace already gives the constant internal linkage.
  const int PDF_FILE_PREVIEW_SIZE = 196;
}
using Tellico::Import::PDFImporter;
// Creates an importer for the given list of URLs. The URLs are forwarded to
// the Importer base class and the cancellation flag starts out cleared.
PDFImporter::PDFImporter(const KURL::List& urls_)
    : Importer(urls_)
    , m_cancelled(false) {
}
// Reports whether this importer can produce a collection of the given type.
// Only the Bibtex collection type is accepted.
bool PDFImporter::canImport(int type_) const {
  const bool isBibtex = (Data::Collection::Bibtex == type_);
  return isBibtex;
}
// Builds a collection from the PDF files given to this importer.
//
// For every URL the function:
//   1. extracts the XMP packet and converts it with xmp2tellico.xsl,
//   2. (when built with poppler) reads the Title/Author/Keywords info entries
//      and scans the first page text for a DOI and an arXiv identifier,
//   3. stores the source URL, forces the entry type to "article" and attaches
//      a preview image of the file,
//   4. merges the resulting entry into the collection being returned.
// Afterwards, entries are updated through the DOI and arXiv fetchers when such
// identifiers were seen, and entries still lacking a title get the file name.
//
// Returns a null pointer when the stylesheet cannot be located or is invalid,
// when the import was cancelled, or when no file could be read at all.
Tellico::Data::CollPtr PDFImporter::collection() {
  TQString xsltfile = ::locate("appdata", TQString::fromLatin1("xmp2tellico.xsl"));
  if(xsltfile.isEmpty()) {
    kdWarning() << "PDFImporter::collection() - can not locate xmp2tellico.xsl" << endl;
    return 0;
  }

  ProgressItem& item = ProgressManager::self()->newProgressItem(this, progressLabel(), true);
  item.setTotalSteps(urls().count());
  connect(&item, TQ_SIGNAL(signalCancelled(ProgressItem*)), TQ_SLOT(slotCancel()));
  // scope guard object tied to this importer's progress item
  ProgressItem::Done done(this);
  const bool showProgress = options() & ImportProgress;

  KURL u;
  u.setPath(xsltfile);

  XSLTHandler xsltHandler(u);
  if(!xsltHandler.isValid()) {
    kdWarning() << "PDFImporter::collection() - invalid xslt in xmp2tellico.xsl" << endl;
    return 0;
  }

  // remember whether any entry carries a DOI or arXiv id, so the
  // corresponding fetchers are only consulted when useful
  bool hasDOI = false;
  bool hasArxiv = false;

  uint j = 0;

  Data::CollPtr coll;
  XMPHandler xmpHandler;
  KURL::List list = urls();
  for(KURL::List::Iterator it = list.begin(); it != list.end() && !m_cancelled; ++it, ++j) {
    FileHandler::FileRef* ref = FileHandler::fileRef(*it);
    if(!ref) {
      // nothing to read for this URL, move on to the next one
      continue;
    }

    Data::CollPtr newColl;
    Data::EntryPtr entry;

    TQString xmp = xmpHandler.extractXMP(ref->fileName());
    // myDebug() << xmp << endl;
    if(xmp.isEmpty()) {
      setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file."));
    } else {
      setStatusMessage(TQString());

      Import::TellicoImporter importer(xsltHandler.applyStylesheet(xmp));
      newColl = importer.collection();
      if(!newColl || newColl->entryCount() == 0) {
        kdWarning() << "PDFImporter::collection() - no collection found" << endl;
        setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file."));
      } else {
        entry = newColl->entries().front();
        hasDOI |= !entry->field(TQString::fromLatin1("doi")).isEmpty();
      }
    }

    // without usable XMP data, fall back to an empty collection and entry
    // so the poppler data, url and preview image still have somewhere to go
    if(!newColl) {
      newColl = new Data::BibtexCollection(true);
    }
    if(!entry) {
      entry = new Data::Entry(newColl);
      newColl->addEntries(entry);
    }

#ifdef HAVE_POPPLER
    // now load from poppler
    Poppler::Document* doc = Poppler::Document::load(ref->fileName());
    if(doc && !doc->isLocked()) {
      // now the question is, do we overwrite XMP data with Poppler data?
      // for now, let's say yes conditionally
      TQString s = doc->getInfo(TQString::fromLatin1("Title")).simplifyWhiteSpace();
      if(!s.isEmpty()) {
        entry->setField(TQString::fromLatin1("title"), s);
      }
      // author could be separated by commas, "and" or whatever
      // we're not going to overwrite it
      if(entry->field(TQString::fromLatin1("author")).isEmpty()) {
        TQRegExp rx(TQString::fromLatin1("\\s*(and|,|;)\\s*"));
        TQStringList authors = TQStringList::split(rx, doc->getInfo(TQString::fromLatin1("Author")).simplifyWhiteSpace());
        entry->setField(TQString::fromLatin1("author"), authors.join(TQString::fromLatin1("; ")));
      }
      s = doc->getInfo(TQString::fromLatin1("Keywords")).simplifyWhiteSpace();
      if(!s.isEmpty()) {
        // keywords are also separated by semi-colons in poppler
        entry->setField(TQString::fromLatin1("keyword"), s);
      }

      // now parse the first page text and try to guess
      Poppler::Page* page = doc->getPage(0);
      if(page) {
        // a null rectangle means get all text on page
        TQString text = page->getText(Poppler::Rectangle());
        // borrowed from Referencer
        // matches "doi:" or "digital object identifier:" followed by
        // something shaped like <prefix>.<registrant>/<suffix>
        TQRegExp rx(TQString::fromLatin1("(?:"
                                         "(?:[Dd][Oo][Ii]:? *)"
                                         "|"
                                         "(?:[Dd]igital *[Oo]bject *[Ii]dentifier:? *)"
                                         ")"
                                         "("
                                         "[^\\.\\s]+"
                                         "\\."
                                         "[^\\/\\s]+"
                                         "\\/"
                                         "[^\\s]+"
                                         ")"));
        if(rx.search(text) > -1) {
          TQString doi = rx.cap(1);
          myDebug() << "PDFImporter::collection() - in PDF file, found DOI: " << doi << endl;
          entry->setField(TQString::fromLatin1("doi"), doi);
          hasDOI = true;
        }
        rx = TQRegExp(TQString::fromLatin1("arXiv:"
                                           "("
                                           "[^\\/\\s]+"
                                           "[\\/\\.]"
                                           "[^\\s]+"
                                           ")"));
        if(rx.search(text) > -1) {
          TQString arxiv = rx.cap(1);
          myDebug() << "PDFImporter::collection() - in PDF file, found arxiv: " << arxiv << endl;
          // the arxiv field is created on demand in the entry's collection
          if(entry->collection()->fieldByName(TQString::fromLatin1("arxiv")) == 0) {
            Data::FieldPtr field = new Data::Field(TQString::fromLatin1("arxiv"), i18n("arXiv ID"));
            field->setCategory(i18n("Publishing"));
            entry->collection()->addField(field);
          }
          entry->setField(TQString::fromLatin1("arxiv"), arxiv);
          hasArxiv = true;
        }

        delete page;
      }
    } else {
      myDebug() << "PDFImporter::collection() - unable to read PDF info (poppler)" << endl;
    }
    delete doc;
#endif

    entry->setField(TQString::fromLatin1("url"), (*it).url());
    // always an article?
    entry->setField(TQString::fromLatin1("entry-type"), TQString::fromLatin1("article"));

    // the preview has to be generated before the file reference is released
    TQPixmap pix = NetAccess::filePreview(ref->fileName(), PDF_FILE_PREVIEW_SIZE);
    delete ref; // removes temp file

    if(!pix.isNull()) {
      // is png best option?
      TQString id = ImageFactory::addImage(pix, TQString::fromLatin1("PNG"));
      if(!id.isEmpty()) {
        // prefer an existing "cover" field, then any existing image field,
        // and only create a new "cover" field as a last resort
        Data::FieldPtr field = newColl->fieldByName(TQString::fromLatin1("cover"));
        if(!field && !newColl->imageFields().isEmpty()) {
          field = newColl->imageFields().front();
        } else if(!field) {
          field = new Data::Field(TQString::fromLatin1("cover"), i18n("Front Cover"), Data::Field::Image);
          newColl->addField(field);
        }
        entry->setField(field, id);
      }
    }

    // the first successfully read file provides the result collection,
    // entries of later files are appended to it
    // NOTE(review): fields added to newColl above (arxiv, cover) are not
    // explicitly added to coll here -- confirm addEntries() copes with that
    if(coll) {
      coll->addEntries(newColl->entries());
    } else {
      coll = newColl;
    }

    if(showProgress) {
      ProgressManager::self()->setProgress(this, j);
      // lets pending events (such as the cancel signal) be delivered
      kapp->processEvents();
    }
  }

  if(m_cancelled) {
    return 0;
  }

  // coll is still null when the URL list was empty or none of the files
  // could be opened; everything below dereferences it, so stop here
  if(!coll) {
    myDebug() << "PDFImporter::collection() - no PDF file could be read" << endl;
    return 0;
  }

  if(hasDOI) {
    myDebug() << "looking for DOI" << endl;
    Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::DOI);
    if(vec.isEmpty()) {
      GUI::CursorSaver cs(TQt::arrowCursor);
      KMessageBox::information(Kernel::self()->widget(),
                               i18n("Tellico is able to download information about entries with a DOI from "
                                    "CrossRef.org. However, you must create an CrossRef account and add a new "
                                    "data source with your account information."),
                               TQString(),
                               TQString::fromLatin1("CrossRefSourceNeeded"));
    } else {
      Data::EntryVec entries = coll->entries();
      for(Fetch::FetcherVec::Iterator fetcher = vec.begin(); fetcher != vec.end(); ++fetcher) {
        for(Data::EntryVecIt entry = entries.begin(); entry != entries.end(); ++entry) {
          fetcher->updateEntrySynchronous(entry);
        }
      }
    }
  }

  if(m_cancelled) {
    return 0;
  }

  if(hasArxiv) {
    Data::EntryVec entries = coll->entries();
    Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::ArxivID);
    for(Fetch::FetcherVec::Iterator fetcher = vec.begin(); fetcher != vec.end(); ++fetcher) {
      for(Data::EntryVecIt entry = entries.begin(); entry != entries.end(); ++entry) {
        fetcher->updateEntrySynchronous(entry);
      }
    }
  }

  // finally
  Data::EntryVec entries = coll->entries();
  for(Data::EntryVecIt entry = entries.begin(); entry != entries.end(); ++entry) {
    if(entry->title().isEmpty()) {
      // use file name
      KURL u = entry->field(TQString::fromLatin1("url"));
      entry->setField(TQString::fromLatin1("title"), u.fileName());
    }
  }

  if(m_cancelled) {
    return 0;
  }

  return coll;
}
void PDFImporter::slotCancel() {
m_cancelled = true;
}
#include "pdfimporter.moc"