/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #include "filterdetect.hxx" #include #include #include #include #include #include #include #include #include #include #include #define WRITER_TEXT_FILTER "Text" #define CALC_TEXT_FILTER "Text - txt - csv (StarCalc)" #define WEB_HTML_FILTER "HTML" #define WRITER_HTML_FILTER "HTML (StarWriter)" #define CALC_HTML_FILTER "calc_HTML_WebQuery" #define WRITER_DOCSERVICE "com.sun.star.text.TextDocument" #define CALC_DOCSERVICE "com.sun.star.sheet.SpreadsheetDocument" using namespace ::com::sun::star; using utl::MediaDescriptor; namespace { bool IsHTMLStream( const uno::Reference& xInStream ) { std::unique_ptr pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) ); if ( !pInStream || pInStream->GetError() ) // No stream return false; // Read the stream header pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW ); const sal_uInt64 nUniPos = pInStream->Tell(); const sal_uInt16 nSize = 4096; OString sHeader; if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode sHeader = read_uInt8s_ToOString( *pInStream, nSize ); else // UTF-16 (nUniPos = 2) sHeader = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US ); // Now check whether the stream begins with a known HTML tag. enum DetectPhase { BeforeTag, TagOpened, InTagName }; DetectPhase dp = BeforeTag; /// BeforeDeclaration -> ? -> DeclarationOpened -> > -> BeforeDeclaration. 
// ---------------------------------------------------------------------------
// NOTE(review): this copy of the file is damaged and cannot compile as it stands.
//   * The original line breaks were collapsed, so every `//` comment now
//     swallows the code that used to follow it on later lines.
//   * Text between '<' and '>' appears to have been stripped: the `#include`
//     directives have no targets, and template arguments are missing (e.g.
//     `uno::Reference&`, `std::unique_ptr pInStream`, `uno::Sequence&`).
//   * The end of IsHTMLStream() and the start of PlainTextFilterDetect::detect()
//     are missing: the text jumps from `return true; // "` directly to
//     `& lDescriptor)`.
//   Restore the file from version control rather than repairing it by hand.
//
// What the surviving text shows:
//
// IsHTMLStream(xInStream)
//   Wraps xInStream via utl::UcbStreamHelper::CreateStream() and returns false
//   when that yields no stream or a stream in error state. It calls
//   StartReadingUnicodeText() and uses the resulting Tell() position to choose
//   how the header is read: position 3 or 0 requests nSize (4096) 8-bit units,
//   anything else requests nSize 16-bit units converted with
//   RTL_TEXTENCODING_ASCII_US. The header is then scanned one character at a
//   time, tracking DetectPhase (BeforeTag / TagOpened / InTagName) and
//   DeclarationPhase (BeforeDeclaration / DeclarationOpened):
//     - whitespace outside a declaration: false right after '<', stop scanning
//       once inside a tag name;
//     - '<': opens a tag when none is open, otherwise false;
//     - '>': stops scanning inside a tag name, resets both phases when a
//       declaration is open, otherwise false;
//     - '!' directly after '<': true.
//   The remaining branches and the final return are not present in this copy.
//
// PlainTextFilterDetect::detect(lDescriptor)
//   Reads PROP_TYPENAME (aType) and PROP_DOCUMENTSERVICE (aDocService) from the
//   media descriptor.
//     - "generic_HTML" / "calc_HTML": returns an empty string unless
//       PROP_INPUTSTREAM is set and IsHTMLStream() accepts it; otherwise sets
//       PROP_FILTERNAME to CALC_HTML_FILTER (Calc service or "calc_HTML"),
//       WRITER_HTML_FILTER (Writer service) or WEB_HTML_FILTER.
//     - "generic_Text": when PROP_STREAM or PROP_INPUTSTREAM is set, calls
//       ZCodec::AttemptDecompression() into an SvMemoryStream; on success the
//       descriptor's PROP_STREAM / PROP_INPUTSTREAM are replaced with the
//       decompressed stream and PROP_URL is cut at the last ".gz". The filter
//       is then chosen by document service first, and otherwise by extension:
//       csv / tsv / tab / xls, or a name ending in ".csv.gz", select
//       CALC_TEXT_FILTER; everything else selects WRITER_TEXT_FILTER.
//     - any other type: returns an empty string.
//   On success the descriptor is written back to lDescriptor and aType is
//   returned.
//   NOTE(review): pInStream is dereferenced for AttemptDecompression() without
//   a null check, while IsHTMLStream() does check the result of the same
//   CreateStream() helper for null -- confirm whether a guard is needed here.
//   NOTE(review): lastIndexOf(".gz") is not anchored to the end of the URL, so
//   a URL containing ".gz" earlier in its text is truncated there -- confirm
//   this is intended.
//
// Remaining definitions
//   initialize() has an empty body and ignores its arguments.
//   PlainTextFilterDetect_getImplementationName() returns
//   "com.sun.star.comp.filters.PlainTextFilterDetect".
//   PlainTextFilterDetect_getSupportedServiceNames() returns a two-element
//   sequence: "com.sun.star.document.ExtendedTypeDetection" and
//   "com.sun.star.comp.filters.PlainTextFilterDetect".
//   The XServiceInfo members forward to those two helpers and to
//   cppu::supportsService(). The extern "C" factory returns
//   cppu::acquire(new PlainTextFilterDetect) and ignores both parameters.
// ---------------------------------------------------------------------------
enum DeclarationPhase { BeforeDeclaration, DeclarationOpened }; DeclarationPhase eDeclaration = BeforeDeclaration; const char* pHeader = sHeader.getStr(); const int nLength = sHeader.getLength(); int i = 0, nStartOfTagIndex = 0; for ( i = 0; i < nLength; ++i, ++pHeader ) { char c = *pHeader; if ((c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f') && eDeclaration == BeforeDeclaration) { if ( dp == TagOpened ) return false; // Invalid: Should start with a tag name else if ( dp == InTagName ) break; // End of tag name reached } else if ( c == '<' ) { if ( dp == BeforeTag ) dp = TagOpened; else return false; // Invalid: Nested '<' } else if ( c == '>' ) { if ( dp == InTagName ) break; // End of tag name reached else if (eDeclaration == DeclarationOpened) { dp = BeforeTag; eDeclaration = BeforeDeclaration; } else return false; // Invalid: Empty tag or before '<' } else if ( c == '!' ) { if ( dp == TagOpened ) return true; // "& lDescriptor) { MediaDescriptor aMediaDesc(lDescriptor); OUString aType = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME(), OUString() ); OUString aDocService = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_DOCUMENTSERVICE(), OUString() ); if ((aType == "generic_HTML") || (aType == "calc_HTML")) { uno::Reference xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()], uno::UNO_QUERY); if (!xInStream.is() || !IsHTMLStream(xInStream)) return OUString(); if ((aDocService == CALC_DOCSERVICE) || (aType == "calc_HTML")) aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_HTML_FILTER); else if (aDocService == WRITER_DOCSERVICE) aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_HTML_FILTER); else aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WEB_HTML_FILTER); } else if (aType == "generic_Text") { uno::Reference xStream(aMediaDesc[MediaDescriptor::PROP_STREAM()], uno::UNO_QUERY); uno::Reference xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()], 
uno::UNO_QUERY); if (xStream.is() || xInStream.is()) { ZCodec aCodecGZ; std::unique_ptr pInStream; if (xStream.is()) pInStream = utl::UcbStreamHelper::CreateStream(xStream); else pInStream = utl::UcbStreamHelper::CreateStream(xInStream); std::unique_ptr pDecompressedStream(new SvMemoryStream()); if (aCodecGZ.AttemptDecompression(*pInStream, *pDecompressedStream)) { uno::Reference xStreamDecompressed(new utl::OStreamWrapper(std::move(pDecompressedStream))); aMediaDesc[MediaDescriptor::PROP_STREAM()] <<= xStreamDecompressed; aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM()] <<= xStreamDecompressed->getInputStream(); OUString aURL = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL(), OUString() ); sal_Int32 nIdx = aURL.lastIndexOf(".gz"); if (nIdx != -1) aMediaDesc[MediaDescriptor::PROP_URL()] <<= aURL.copy(0, nIdx); } } // Get the file name extension. INetURLObject aParser(aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL(), OUString() ) ); OUString aExt = aParser.getExtension(INetURLObject::LAST_SEGMENT, true, INetURLObject::DecodeMechanism::WithCharset); aExt = aExt.toAsciiLowerCase(); OUString aName = aParser.getName().toAsciiLowerCase(); // Decide which filter to use based on the document service first, // then on extension if that's not available. if (aDocService == CALC_DOCSERVICE) aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER); else if (aDocService == WRITER_DOCSERVICE) aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER); else if (aExt == "csv" || aExt == "tsv" || aExt == "tab" || aExt == "xls" || aName.endsWith(".csv.gz")) aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(CALC_TEXT_FILTER); else aMediaDesc[MediaDescriptor::PROP_FILTERNAME()] <<= OUString(WRITER_TEXT_FILTER); } else // Nothing to detect. 
return OUString(); aMediaDesc >> lDescriptor; return aType; } // XInitialization void SAL_CALL PlainTextFilterDetect::initialize(const uno::Sequence& /*aArguments*/) { } OUString PlainTextFilterDetect_getImplementationName() { return OUString("com.sun.star.comp.filters.PlainTextFilterDetect"); } uno::Sequence PlainTextFilterDetect_getSupportedServiceNames() { uno::Sequence aRet(2); OUString* pArray = aRet.getArray(); pArray[0] = "com.sun.star.document.ExtendedTypeDetection"; pArray[1] = "com.sun.star.comp.filters.PlainTextFilterDetect"; return aRet; } // XServiceInfo OUString SAL_CALL PlainTextFilterDetect::getImplementationName() { return PlainTextFilterDetect_getImplementationName(); } sal_Bool SAL_CALL PlainTextFilterDetect::supportsService(const OUString& rServiceName) { return cppu::supportsService(this, rServiceName); } uno::Sequence SAL_CALL PlainTextFilterDetect::getSupportedServiceNames() { return PlainTextFilterDetect_getSupportedServiceNames(); } extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface* com_sun_star_comp_filters_PlainTextFilterDetect_get_implementation(css::uno::XComponentContext* , css::uno::Sequence const &) { return cppu::acquire(new PlainTextFilterDetect); } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */