LibOFX
ofx_preproc.cpp
Go to the documentation of this file.
00001 /***************************************************************************
00002           ofx_preproc.cpp
00003                              -------------------
00004     copyright            : (C) 2002 by Benoit Grégoire
00005     email                : benoitg@coeus.ca
00006 ***************************************************************************/
00012 /***************************************************************************
00013  *                                                                         *
00014  *   This program is free software; you can redistribute it and/or modify  *
00015  *   it under the terms of the GNU General Public License as published by  *
00016  *   the Free Software Foundation; either version 2 of the License, or     *
00017  *   (at your option) any later version.                                   *
00018  *                                                                         *
00019  ***************************************************************************/
#include "../config.h"
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <cstring>
#include <stdio.h>
#include <string>
#include "ParserEventGeneratorKit.h"
#include "libofx.h"
#include "messages.hh"
#include "ofx_sgml.hh"
#include "ofc_sgml.hh"
#include "ofx_preproc.hh"
#include "ofx_utilities.hh"
#ifdef HAVE_ICONV
#include <iconv.h>
#endif
#ifdef OS_WIN32
# include <io.h>     // close()
#else
# include <unistd.h> // close()
#endif
00036 
00037 #ifdef OS_WIN32
00038 # define DIRSEP "\\"
00039 #else
00040 # define DIRSEP "/"
00041 #endif
00042 
00043 #ifdef OS_WIN32
00044 # include "win32.hh"
00045 # include <windows.h> // for GetModuleFileName()
00046 # undef ERROR
00047 # undef DELETE
00048 #endif
00049 
00050 #define LIBOFX_DEFAULT_INPUT_ENCODING "CP1252"
00051 #define LIBOFX_DEFAULT_OUTPUT_ENCODING "UTF-8"
00052 
00053 using namespace std;
/**
 * Fallback directories probed by find_dtd(), in order, for the DTD files.
 *
 * MAKEFILE_DTD_PATH, when the build defines it, is tried first.
 * NOTE(review): the "~" entry is passed verbatim to std::ifstream, which does
 * not perform shell-style tilde expansion, so it only matches a directory
 * literally named "~" relative to the current working directory.
 */
const char *DTD_SEARCH_PATH[] =
{
#ifdef MAKEFILE_DTD_PATH
  MAKEFILE_DTD_PATH ,
#endif
  "/usr/local/share/libofx/dtd",
  "/usr/share/libofx/dtd",
  "~"
};

/**
 * Number of entries in DTD_SEARCH_PATH.
 *
 * Derived from the array itself so that the count cannot drift from the list
 * when an entry is added or MAKEFILE_DTD_PATH is toggled.
 */
const int DTD_SEARCH_PATH_NUM = sizeof(DTD_SEARCH_PATH) / sizeof(DTD_SEARCH_PATH[0]);

/** Size, in bytes, of the fixed-size scratch buffers used in this file. */
const unsigned int READ_BUFFER_SIZE = 1024;
00076 
00081 int ofx_proc_file(LibofxContextPtr ctx, const char * p_filename)
00082 {
00083   LibofxContext *libofx_context;
00084   bool ofx_start = false;
00085   bool ofx_end = false;
00086   bool file_is_xml = false;
00087 
00088   ifstream input_file;
00089   ofstream tmp_file;
00090   char buffer[READ_BUFFER_SIZE];
00091   char *iconv_buffer;
00092   string s_buffer;
00093   char *filenames[3];
00094   char tmp_filename[256];
00095   int tmp_file_fd;
00096 #ifdef HAVE_ICONV
00097   iconv_t conversion_descriptor;
00098 #endif
00099   libofx_context = (LibofxContext*)ctx;
00100 
00101   if (p_filename != NULL && strcmp(p_filename, "") != 0)
00102   {
00103     message_out(DEBUG, string("ofx_proc_file():Opening file: ") + p_filename);
00104 
00105     input_file.open(p_filename);
00106     if (!input_file)
00107     {
00108       message_out(ERROR, "ofx_proc_file():Unable to open the input file " + string(p_filename));
00109     }
00110 
00111     mkTempFileName("libofxtmpXXXXXX", tmp_filename, sizeof(tmp_filename));
00112 
00113     message_out(DEBUG, "ofx_proc_file(): Creating temp file: " + string(tmp_filename));
00114     tmp_file_fd = mkstemp(tmp_filename);
00115     if (tmp_file_fd)
00116     {
00117       tmp_file.open(tmp_filename);
00118       if (!tmp_file)
00119       {
00120         message_out(ERROR, "ofx_proc_file():Unable to open the created temp file " + string(tmp_filename));
00121         return -1;
00122       }
00123     }
00124     else
00125     {
00126       message_out(ERROR, "ofx_proc_file():Unable to create a temp file at " + string(tmp_filename));
00127       return -1;
00128     }
00129 
00130     if (input_file && tmp_file)
00131     {
00132       int header_separator_idx;
00133       string header_name;
00134       string header_value;
00135       string ofx_encoding;
00136       string ofx_charset;
00137       bool end_of_line;
00138       do
00139       {
00140         s_buffer.clear();
00141         bool end_of_line = false;
00142         do
00143         {
00144           input_file.get(buffer, sizeof(buffer), '\n');
00145           //cout<<buffer<<"\n";
00146           s_buffer.append(buffer);
00147           //cout<<"input_file.gcount(): "<<input_file.gcount()<<" sizeof(buffer): "<<sizeof(buffer)<<endl;
00148           if ( !input_file.eof() && (input_file.peek() == '\n'))
00149           {
00150             input_file.get(); // Discard the newline
00151             s_buffer.append("\n");
00152             end_of_line = true;
00153           }
00154           else if ( !input_file.eof() && input_file.fail())
00155           {
00156             input_file.clear();
00157           }
00158         }
00159         while (!input_file.eof() || !end_of_line);
00160 
00161         if (ofx_start == false && (s_buffer.find("<?xml") != string::npos))
00162         {
00163           message_out(DEBUG, "ofx_proc_file(): File is an actual XML file, iconv conversion will be skipped.");
00164           file_is_xml = true;
00165         }
00166 
00167         int ofx_start_idx;
00168         if (ofx_start == false &&
00169             (
00170               (libofx_context->currentFileType() == OFX &&
00171                ((ofx_start_idx = s_buffer.find("<OFX>")) !=
00172                 string::npos || (ofx_start_idx = s_buffer.find("<ofx>")) != string::npos))
00173               || (libofx_context->currentFileType() == OFC &&
00174                   ((ofx_start_idx = s_buffer.find("<OFC>")) != string::npos ||
00175                    (ofx_start_idx = s_buffer.find("<ofc>")) != string::npos))
00176             )
00177            )
00178         {
00179           ofx_start = true;
00180           if(file_is_xml==false)
00181           {
00182             s_buffer.erase(0, ofx_start_idx); //Fix for really broken files that don't have a newline after the header.
00183           }
00184           message_out(DEBUG, "ofx_proc_file():<OFX> or <OFC> has been found");
00185 
00186           if(file_is_xml==true)
00187           {
00188             static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
00189             if(putenv(sp_charset_fixed)!=0)
00190             {
00191               message_out(ERROR, "ofx_proc_file(): putenv failed");
00192             }
00193             /* Normally the following would be "xml".
00194              * Unfortunately, opensp's generic api will garble UTF-8 if this is
00195              * set to xml.  So we set any single byte encoding to avoid messing
00196              * up UTF-8.  Unfortunately this means that non-UTF-8 files will not
00197              * get properly translated.  We'd need to manually detect the
00198              * encoding in the XML header and convert the xml with iconv like we
00199              * do for SGML to work around the problem.  Most unfortunate. */
00200             static char sp_encoding[] = "SP_ENCODING=ms-dos";
00201             if(putenv(sp_encoding)!=0)
00202             {
00203               message_out(ERROR, "ofx_proc_file(): putenv failed");
00204             }
00205           }
00206           else
00207           {
00208             static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
00209             if(putenv(sp_charset_fixed)!=0)
00210             {
00211               message_out(ERROR, "ofx_proc_file(): putenv failed");
00212             }
00213             static char sp_encoding[] = "SP_ENCODING=ms-dos"; //Any single byte encoding will do, we don't want opensp messing up UTF-8;
00214             if(putenv(sp_encoding)!=0)
00215             {
00216               message_out(ERROR, "ofx_proc_file(): putenv failed");
00217             }
00218 #ifdef HAVE_ICONV
00219           string fromcode;
00220           string tocode;
00221           if (ofx_encoding.compare("USASCII") == 0)
00222           {
00223             if (ofx_charset.compare("ISO-8859-1") == 0 || ofx_charset.compare("8859-1") == 0)
00224             {
00225               //Only "ISO-8859-1" is actually a legal value, but since the banks follows the spec SO well...
00226               fromcode = "ISO-8859-1";
00227             }
00228             else if (ofx_charset.compare("1252") == 0 || ofx_charset.compare("CP1252") == 0)
00229             {
00230               //Only "1252" is actually a legal value, but since the banks follows the spec SO well...
00231               fromcode = "CP1252";
00232             }
00233             else if (ofx_charset.compare("NONE") == 0)
00234             {
00235               fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
00236             }
00237             else
00238             {
00239               fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
00240             }
00241           }
00242           else if (ofx_encoding.compare("UTF-8") == 0 || ofx_encoding.compare("UNICODE") == 0)
00243           {
00244                 //While "UNICODE" isn't a legal value, some cyrilic files do specify it as such...
00245             fromcode = "UTF-8";
00246           }
00247           else
00248           {
00249             fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
00250           }
00251           tocode = LIBOFX_DEFAULT_OUTPUT_ENCODING;
00252           message_out(DEBUG, "ofx_proc_file(): Setting up iconv for fromcode: " + fromcode + ", tocode: " + tocode);
00253           conversion_descriptor = iconv_open (tocode.c_str(), fromcode.c_str());
00254 #endif
00255           }
00256         }
00257         else
00258         {
00259           //We are still in the headers
00260           if ((header_separator_idx = s_buffer.find(':')) != string::npos)
00261           {
00262             //Header processing
00263             header_name.assign(s_buffer.substr(0, header_separator_idx));
00264             header_value.assign(s_buffer.substr(header_separator_idx + 1));
00265             while ( header_value[header_value.length() -1 ] == '\n' ||
00266                     header_value[header_value.length() -1 ] == '\r' )
00267               header_value.erase(header_value.length() - 1);
00268             message_out(DEBUG, "ofx_proc_file():Header: " + header_name + " with value: " + header_value + " has been found");
00269             if (header_name.compare("ENCODING") == 0)
00270             {
00271               ofx_encoding.assign(header_value);
00272             }
00273             if (header_name.compare("CHARSET") == 0)
00274             {
00275               ofx_charset.assign(header_value);
00276             }
00277           }
00278         }
00279 
00280         if (file_is_xml==true || (ofx_start == true && ofx_end == false))
00281         {
00282           if(ofx_start == true)
00283           {
00284             /* The above test won't help us if the <OFX> tag is on the same line
00285              * as the xml header, but as opensp can't be used to parse it anyway
00286              * this isn't a great loss for now.
00287              */
00288             s_buffer = sanitize_proprietary_tags(s_buffer);
00289           }
00290           //cout<< s_buffer<<"\n";
00291           if(file_is_xml==false)
00292           {
00293 #ifdef HAVE_ICONV
00294             size_t inbytesleft = strlen(s_buffer.c_str());
00295             size_t outbytesleft = inbytesleft * 2 - 1;
00296             iconv_buffer = (char*) malloc (inbytesleft * 2);
00297             memset(iconv_buffer, 0, inbytesleft * 2);
00298 #ifdef OS_WIN32
00299             const char * inchar = (const char *)s_buffer.c_str();
00300 #else
00301             char * inchar = (char *)s_buffer.c_str();
00302 #endif
00303             char * outchar = iconv_buffer;
00304             int iconv_retval = iconv (conversion_descriptor,
00305                                       &inchar, &inbytesleft,
00306                                       &outchar, &outbytesleft);
00307             if (iconv_retval == -1)
00308             {
00309               message_out(ERROR, "ofx_proc_file(): Conversion error");
00310             }
00311             s_buffer = iconv_buffer;
00312             free (iconv_buffer);
00313 #endif
00314           }
00315           cout<<s_buffer<<"\n";
00316           tmp_file.write(s_buffer.c_str(), s_buffer.length());
00317         }
00318 
00319         if (ofx_start == true &&
00320             (
00321               (libofx_context->currentFileType() == OFX &&
00322                ((ofx_start_idx = s_buffer.find("</OFX>")) != string::npos ||
00323                 (ofx_start_idx = s_buffer.find("</ofx>")) != string::npos))
00324               || (libofx_context->currentFileType() == OFC &&
00325                   ((ofx_start_idx = s_buffer.find("</OFC>")) != string::npos ||
00326                    (ofx_start_idx = s_buffer.find("</ofc>")) != string::npos))
00327             )
00328            )
00329         {
00330           ofx_end = true;
00331           message_out(DEBUG, "ofx_proc_file():</OFX> or </OFC>  has been found");
00332         }
00333 
00334       }
00335       while (!input_file.eof() && !input_file.bad());
00336     }
00337     input_file.close();
00338     tmp_file.close();
00339 #ifdef HAVE_ICONV
00340     if(file_is_xml==false)
00341     {
00342       iconv_close(conversion_descriptor);
00343     }
00344 #endif
00345     char filename_openspdtd[255];
00346     char filename_dtd[255];
00347     char filename_ofx[255];
00348     strncpy(filename_openspdtd, find_dtd(ctx, OPENSPDCL_FILENAME).c_str(), 255); //The opensp sgml dtd file
00349     if (libofx_context->currentFileType() == OFX)
00350     {
00351       strncpy(filename_dtd, find_dtd(ctx, OFX160DTD_FILENAME).c_str(), 255); //The ofx dtd file
00352     }
00353     else if (libofx_context->currentFileType() == OFC)
00354     {
00355       strncpy(filename_dtd, find_dtd(ctx, OFCDTD_FILENAME).c_str(), 255); //The ofc dtd file
00356     }
00357     else
00358     {
00359       message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00360     }
00361 
00362     if ((string)filename_dtd != "" && (string)filename_openspdtd != "")
00363     {
00364       strncpy(filename_ofx, tmp_filename, 255); //The processed ofx file
00365       filenames[0] = filename_openspdtd;
00366       filenames[1] = filename_dtd;
00367       filenames[2] = filename_ofx;
00368       if (libofx_context->currentFileType() == OFX)
00369       {
00370         ofx_proc_sgml(libofx_context, 3, filenames);
00371       }
00372       else if (libofx_context->currentFileType() == OFC)
00373       {
00374         ofc_proc_sgml(libofx_context, 3, filenames);
00375       }
00376       else
00377       {
00378         message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00379       }
00380       if (remove(tmp_filename) != 0)
00381       {
00382         message_out(ERROR, "ofx_proc_file(): Error deleting temporary file " + string(tmp_filename));
00383       }
00384     }
00385     else
00386     {
00387       message_out(ERROR, "ofx_proc_file(): FATAL: Missing DTD, aborting");
00388     }
00389   }
00390   else
00391   {
00392     message_out(ERROR, "ofx_proc_file():No input file specified");
00393   }
00394   return 0;
00395 }
00396 
00397 
00402 string sanitize_proprietary_tags(string input_string)
00403 {
00404   unsigned int i;
00405   size_t input_string_size;
00406   bool strip = false;
00407   bool tag_open = false;
00408   int tag_open_idx = 0; //Are we within < > ?
00409   bool closing_tag_open = false; //Are we within </ > ?
00410   int orig_tag_open_idx = 0;
00411   bool proprietary_tag = false; //Are we within a proprietary element?
00412   bool proprietary_closing_tag = false;
00413   int crop_end_idx = 0;
00414   char buffer[READ_BUFFER_SIZE] = "";
00415   char tagname[READ_BUFFER_SIZE] = "";
00416   int tagname_idx = 0;
00417   char close_tagname[READ_BUFFER_SIZE] = "";
00418 
00419   for (i = 0; i < READ_BUFFER_SIZE; i++)
00420   {
00421     buffer[i] = 0;
00422     tagname[i] = 0;
00423     close_tagname[i] = 0;
00424   }
00425 
00426   input_string_size = input_string.size();
00427 
00428   for (i = 0; i <= input_string_size; i++)
00429   {
00430     if (input_string.c_str()[i] == '<')
00431     {
00432       tag_open = true;
00433       tag_open_idx = i;
00434       if (proprietary_tag == true && input_string.c_str()[i+1] == '/')
00435       {
00436         //We are now in a closing tag
00437         closing_tag_open = true;
00438         //cout<<"Comparaison: "<<tagname<<"|"<<&(input_string.c_str()[i+2])<<"|"<<strlen(tagname)<<endl;
00439         if (strncmp(tagname, &(input_string.c_str()[i+2]), strlen(tagname)) != 0)
00440         {
00441           //If it is the begining of an other tag
00442           //cout<<"DIFFERENT!"<<endl;
00443           crop_end_idx = i - 1;
00444           strip = true;
00445         }
00446         else
00447         {
00448           //Otherwise, it is the start of the closing tag of the proprietary tag
00449           proprietary_closing_tag = true;
00450         }
00451       }
00452       else if (proprietary_tag == true)
00453       {
00454         //It is the start of a new tag, following a proprietary tag
00455         crop_end_idx = i - 1;
00456         strip = true;
00457       }
00458     }
00459     else if (input_string.c_str()[i] == '>')
00460     {
00461       tag_open = false;
00462       closing_tag_open = false;
00463       tagname[tagname_idx] = 0;
00464       tagname_idx = 0;
00465       if (proprietary_closing_tag == true)
00466       {
00467         crop_end_idx = i;
00468         strip = true;
00469       }
00470     }
00471     else if (tag_open == true && closing_tag_open == false)
00472     {
00473       if (input_string.c_str()[i] == '.')
00474       {
00475         if (proprietary_tag != true)
00476         {
00477           orig_tag_open_idx = tag_open_idx;
00478           proprietary_tag = true;
00479         }
00480       }
00481       tagname[tagname_idx] = input_string.c_str()[i];
00482       tagname_idx++;
00483     }
00484     //cerr <<i<<endl;
00485     if (strip == true && orig_tag_open_idx < input_string.size())
00486     {
00487       input_string.copy(buffer, (crop_end_idx - orig_tag_open_idx) + 1, orig_tag_open_idx);
00488       message_out(INFO, "sanitize_proprietary_tags() (end tag or new tag) removed: " + string(buffer));
00489       input_string.erase(orig_tag_open_idx, (crop_end_idx - orig_tag_open_idx) + 1);
00490       i = orig_tag_open_idx - 1;
00491       proprietary_tag = false;
00492       proprietary_closing_tag = false;
00493       closing_tag_open = false;
00494       tag_open = false;
00495       strip = false;
00496     }
00497 
00498   }//end for
00499   if (proprietary_tag == true && orig_tag_open_idx < input_string.size())
00500   {
00501     if (crop_end_idx == 0)   //no closing tag
00502     {
00503       crop_end_idx = input_string.size() - 1;
00504     }
00505     input_string.copy(buffer, (crop_end_idx - orig_tag_open_idx) + 1, orig_tag_open_idx);
00506     message_out(INFO, "sanitize_proprietary_tags() (end of line) removed: " + string(buffer));
00507     input_string.erase(orig_tag_open_idx, (crop_end_idx - orig_tag_open_idx) + 1);
00508   }
00509   return input_string;
00510 }
00511 
00512 
#ifdef OS_WIN32
/**
 * Build the DTD directory relative to the location of the running executable.
 *
 * Partial implementation of
 * http://developer.gnome.org/doc/API/2.0/glib/glib-Windows-Compatibility-Functions.html#g-win32-get-package-installation-directory
 *
 * The file name is dropped from the module path, then a trailing "bin" or
 * "lib" directory (case-insensitive) is dropped too, and
 * "\share\libofx\dtd" is appended.
 *
 * @return The directory, or an empty string if the module path could not be
 *         obtained or did not fit in MAX_PATH characters.
 */
static std::string get_dtd_installation_directory()
{
  char module_path[MAX_PATH];

  //The ...A variant is named explicitly because the buffer is a char array;
  //the generic macro would select the wide-character variant in UNICODE builds.
  const DWORD length = GetModuleFileNameA(NULL, module_path, MAX_PATH);
  if (length == 0 || length >= MAX_PATH)
  {
    //0 is failure; a return value equal to the buffer size means truncation.
    return "";
  }

  std::string directory(module_path, length);

  //Drop the executable's file name.
  std::string::size_type separator = directory.rfind('\\');
  if (separator != std::string::npos)
  {
    directory.erase(separator);
  }

  //Drop a trailing "bin" or "lib" component.
  separator = directory.rfind('\\');
  if (separator != std::string::npos)
  {
    const char *last_component = directory.c_str() + separator + 1;
    if (_stricmp(last_component, "bin") == 0 ||
        _stricmp(last_component, "lib") == 0)
    {
      directory.erase(separator);
    }
  }

  directory += "\\share\\libofx\\dtd";
  return directory;
}
#endif
00537 
00538 
00551 std::string find_dtd(LibofxContextPtr ctx, const std::string& dtd_filename)
00552 {
00553   string dtd_path_filename;
00554   char *env_dtd_path;
00555 
00556   dtd_path_filename = reinterpret_cast<const LibofxContext*>(ctx)->dtdDir();
00557   if (!dtd_path_filename.empty())
00558   {
00559     dtd_path_filename.append(dtd_filename);
00560     ifstream dtd_file(dtd_path_filename.c_str());
00561     if (dtd_file)
00562     {
00563       message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00564       return dtd_path_filename;
00565     }
00566   }
00567 
00568 #ifdef OS_WIN32
00569   dtd_path_filename = get_dtd_installation_directory();
00570   if (!dtd_path_filename.empty())
00571   {
00572     dtd_path_filename.append(DIRSEP);
00573     dtd_path_filename.append(dtd_filename);
00574     ifstream dtd_file(dtd_path_filename.c_str());
00575     if (dtd_file)
00576     {
00577       message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00578       return dtd_path_filename;
00579     }
00580   }
00581 #endif
00582   /* Search in environement variable OFX_DTD_PATH */
00583   env_dtd_path = getenv("OFX_DTD_PATH");
00584   if (env_dtd_path)
00585   {
00586     dtd_path_filename.append(env_dtd_path);
00587     dtd_path_filename.append(DIRSEP);
00588     dtd_path_filename.append(dtd_filename);
00589     ifstream dtd_file(dtd_path_filename.c_str());
00590     if (!dtd_file)
00591     {
00592       message_out(STATUS, "find_dtd():OFX_DTD_PATH env variable was was present, but unable to open the file " + dtd_path_filename);
00593     }
00594     else
00595     {
00596       message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00597       return dtd_path_filename;
00598     }
00599   }
00600 
00601   for (int i = 0; i < DTD_SEARCH_PATH_NUM; i++)
00602   {
00603     dtd_path_filename = DTD_SEARCH_PATH[i];
00604     dtd_path_filename.append(DIRSEP);
00605     dtd_path_filename.append(dtd_filename);
00606     ifstream dtd_file(dtd_path_filename.c_str());
00607     if (!dtd_file)
00608     {
00609       message_out(DEBUG, "find_dtd():Unable to open the file " + dtd_path_filename);
00610     }
00611     else
00612     {
00613       message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00614       return dtd_path_filename;
00615     }
00616   }
00617 
00618   /* Last resort, look in source tree relative path (useful for development) */
00619   dtd_path_filename = "";
00620   dtd_path_filename.append("..");
00621   dtd_path_filename.append(DIRSEP);
00622   dtd_path_filename.append("dtd");
00623   dtd_path_filename.append(DIRSEP);
00624   dtd_path_filename.append(dtd_filename);
00625   ifstream dtd_file(dtd_path_filename.c_str());
00626   if (!dtd_file)
00627   {
00628     message_out(DEBUG, "find_dtd(): Unable to open the file " + dtd_path_filename + ", most likely we are not in the source tree.");
00629   }
00630   else
00631   {
00632     message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00633     return dtd_path_filename;
00634   }
00635 
00636 
00637   message_out(ERROR, "find_dtd():Unable to find the DTD named " + dtd_filename);
00638   return "";
00639 }
00640 
00641