From 75498776319c46af3a2cda622302e85d100ed936 Mon Sep 17 00:00:00 2001
From: tqtg <tuantq.vnu@gmail.com>
Date: Wed, 25 Oct 2023 03:49:51 +0000
Subject: [PATCH] Add back libfm files

---
 cornac/models/fm/libfm/libfm/libfm.cpp        | 441 ++++++++++++++++++
 .../models/fm/libfm/libfm/tools/convert.cpp   | 205 ++++++++
 .../models/fm/libfm/libfm/tools/transpose.cpp | 170 +++++++
 3 files changed, 816 insertions(+)
 create mode 100644 cornac/models/fm/libfm/libfm/libfm.cpp
 create mode 100644 cornac/models/fm/libfm/libfm/tools/convert.cpp
 create mode 100644 cornac/models/fm/libfm/libfm/tools/transpose.cpp
diff --git a/cornac/models/fm/libfm/libfm/libfm.cpp b/cornac/models/fm/libfm/libfm/libfm.cpp
new file mode 100644
index 000000000..82b795a21
--- /dev/null
+++ b/cornac/models/fm/libfm/libfm/libfm.cpp
@@ -0,0 +1,441 @@
+// Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle
+// Contact:   srendle@libfm.org, http://www.libfm.org/
+//
+// This file is part of libFM.
+//
+// libFM is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// libFM is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with libFM.  If not, see <http://www.gnu.org/licenses/>.
+//
+//
+// libfm.cpp: main file for libFM (Factorization Machines)
+//
+// Based on the publication(s):
+// - Steffen Rendle (2010): Factorization Machines, in Proceedings of the 10th
+//   IEEE International Conference on Data Mining (ICDM 2010), Sydney,
+//   Australia.
+// - Steffen Rendle, Zeno Gantner, Christoph Freudenthaler, Lars Schmidt-Thieme
+//   (2011): Fast Context-aware Recommendations with Factorization Machines, in
+//   Proceedings of the 34th international ACM SIGIR conference on Research and
+//   development in information retrieval (SIGIR 2011), Beijing, China.
+// - Christoph Freudenthaler, Lars Schmidt-Thieme, Steffen Rendle (2011):
+//   Bayesian Factorization Machines, in NIPS Workshop on Sparse Representation
+//   and Low-rank Approximation (NIPS-WS 2011), Spain.
+// - Steffen Rendle (2012): Learning Recommender Systems with Adaptive
+//   Regularization, in Proceedings of the 5th ACM International Conference on
+//   Web Search and Data Mining (WSDM 2012), Seattle, USA.
+// - Steffen Rendle (2012): Factorization Machines with libFM, ACM Transactions
+//   on Intelligent Systems and Technology (TIST 2012).
+// - Steffen Rendle (2013): Scaling Factorization Machines to Relational Data,
+//   in Proceedings of the 39th international conference on Very Large Data
+//   Bases (VLDB 2013), Trento, Italy.
+
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+#include <iomanip>
+#include "../util/util.h"
+#include "../util/cmdline.h"
+#include "../fm_core/fm_model.h"
+#include "src/Data.h"
+#include "src/fm_learn.h"
+#include "src/fm_learn_sgd.h"
+#include "src/fm_learn_sgd_element.h"
+#include "src/fm_learn_sgd_element_adapt_reg.h"
+#include "src/fm_learn_mcmc_simultaneous.h"
+
+
+using namespace std;
+
+int main(int argc, char **argv) {
+  // libFM driver: parse options, load data, set up model and learner, train, write outputs. Returns 1 on a reported error.
+  try {
+    CMDLine cmdline(argc, argv);
+    std::cout << "----------------------------------------------------------------------------" << std::endl;
+    std::cout << "libFM" << std::endl;
+    std::cout << "  Version: 1.4.4" << std::endl;
+    std::cout << "  Author:  Steffen Rendle, srendle@libfm.org" << std::endl;
+    std::cout << "  WWW:     http://www.libfm.org/" << std::endl;
+    std::cout << "This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt." << std::endl;
+    std::cout << "This is free software, and you are welcome to redistribute it under certain" << std::endl;
+    std::cout << "conditions; for details see license.txt." << std::endl;
+    std::cout << "----------------------------------------------------------------------------" << std::endl;
+
+    const std::string param_task       = cmdline.registerParameter("task", "r=regression, c=binary classification [MANDATORY]");
+    const std::string param_meta_file  = cmdline.registerParameter("meta", "filename for meta information about data set");
+    const std::string param_train_file = cmdline.registerParameter("train", "filename for training data [MANDATORY]");
+    const std::string param_test_file  = cmdline.registerParameter("test", "filename for test data [MANDATORY]");
+    const std::string param_val_file   = cmdline.registerParameter("validation", "filename for validation data (only for SGDA)");
+    const std::string param_out        = cmdline.registerParameter("out", "filename for output");
+
+    const std::string param_dim        = cmdline.registerParameter("dim", "'k0,k1,k2': k0=use bias, k1=use 1-way interactions, k2=dim of 2-way interactions; default=1,1,8");
+    const std::string param_regular    = cmdline.registerParameter("regular", "'r0,r1,r2' for SGD and ALS: r0=bias regularization, r1=1-way regularization, r2=2-way regularization");
+    const std::string param_init_stdev = cmdline.registerParameter("init_stdev", "stdev for initialization of 2-way factors; default=0.1");
+    const std::string param_num_iter   = cmdline.registerParameter("iter", "number of iterations; default=100");
+    const std::string param_learn_rate = cmdline.registerParameter("learn_rate", "learn_rate for SGD; default=0.1");
+
+    const std::string param_method     = cmdline.registerParameter("method", "learning method (SGD, SGDA, ALS, MCMC); default=MCMC");
+
+    const std::string param_verbosity  = cmdline.registerParameter("verbosity", "how much infos to print; default=0");
+    const std::string param_r_log      = cmdline.registerParameter("rlog", "write measurements within iterations to a file; default=''");
+    const std::string param_seed       = cmdline.registerParameter("seed", "integer value, default=None");
+
+    const std::string param_help       = cmdline.registerParameter("help", "this screen");
+
+    const std::string param_relation   = cmdline.registerParameter("relation", "BS: filenames for the relations, default=''");
+
+    const std::string param_cache_size = cmdline.registerParameter("cache_size", "cache size for data storage (only applicable if data is in binary format), default=infty");
+
+    const std::string param_save_model = cmdline.registerParameter("save_model", "filename for writing the FM model");
+    const std::string param_load_model = cmdline.registerParameter("load_model", "filename for reading the FM model");
+
+    const std::string param_do_sampling  = "do_sampling";
+    const std::string param_do_multilevel  = "do_multilevel";
+    const std::string param_num_eval_cases  = "num_eval_cases";
+
+    if (cmdline.hasParameter(param_help) || (argc == 1)) {
+      cmdline.print_help();
+      return 0;
+    }
+    cmdline.checkParameters();
+
+    // Seed
+    long int seed = cmdline.getValue(param_seed, time(NULL));
+    srand ( seed );
+
+    if (! cmdline.hasParameter(param_method)) { cmdline.setValue(param_method, "mcmc"); }
+    if (! cmdline.hasParameter(param_init_stdev)) { cmdline.setValue(param_init_stdev, "0.1"); }
+    if (! cmdline.hasParameter(param_dim)) { cmdline.setValue(param_dim, "1,1,8"); }
+    if (! cmdline.hasParameter(param_learn_rate)) { cmdline.setValue(param_learn_rate, "0.1"); } // documented default; the SGD setup below requires 1 or 3 values
+    // Check for invalid flags.
+    if (! cmdline.getValue(param_method).compare("mcmc") && cmdline.hasParameter(param_save_model)) {
+      std::cout << "WARNING: -save_model enabled only for SGD and ALS." << std::endl;
+      cmdline.removeParameter(param_save_model);
+      return 0; // NOTE: the run stops here after the warning
+    }
+
+    if (! cmdline.getValue(param_method).compare("mcmc") && cmdline.hasParameter(param_load_model)) {
+      std::cout << "WARNING: -load_model enabled only for SGD and ALS." << std::endl;
+      cmdline.removeParameter(param_load_model);
+      return 0; // NOTE: the run stops here after the warning
+    }
+
+    if (! cmdline.getValue(param_method).compare("als")) { // als is an mcmc without sampling and hyperparameter inference
+      cmdline.setValue(param_method, "mcmc");
+      if (! cmdline.hasParameter(param_do_sampling)) { cmdline.setValue(param_do_sampling, "0"); }
+      if (! cmdline.hasParameter(param_do_multilevel)) { cmdline.setValue(param_do_multilevel, "0"); }
+    }
+
+    // (1) Load the data
+    std::cout << "Loading train...\t" << std::endl;
+    Data train(
+      cmdline.getValue(param_cache_size, 0),
+      ! (!cmdline.getValue(param_method).compare("mcmc")), // no original data for mcmc
+      ! (!cmdline.getValue(param_method).compare("sgd") || !cmdline.getValue(param_method).compare("sgda")) // no transpose data for sgd, sgda
+    );
+    train.load(cmdline.getValue(param_train_file));
+    if (cmdline.getValue(param_verbosity, 0) > 0) { train.debug(); }
+
+    std::cout << "Loading test... \t" << std::endl;
+    Data test(
+      cmdline.getValue(param_cache_size, 0),
+      ! (!cmdline.getValue(param_method).compare("mcmc")), // no original data for mcmc
+      ! (!cmdline.getValue(param_method).compare("sgd") || !cmdline.getValue(param_method).compare("sgda")) // no transpose data for sgd, sgda
+    );
+    test.load(cmdline.getValue(param_test_file));
+    if (cmdline.getValue(param_verbosity, 0) > 0) { test.debug(); }
+
+    Data* validation = NULL;
+    if (cmdline.hasParameter(param_val_file)) {
+      if (cmdline.getValue(param_method).compare("sgda")) {
+        std::cout << "WARNING: Validation data is only used for SGDA. The data is ignored." << std::endl;
+      } else {
+        std::cout << "Loading validation set...\t" << std::endl;
+        validation = new Data(
+          cmdline.getValue(param_cache_size, 0),
+          ! (!cmdline.getValue(param_method).compare("mcmc")), // no original data for mcmc
+          ! (!cmdline.getValue(param_method).compare("sgd") || !cmdline.getValue(param_method).compare("sgda")) // no transpose data for sgd, sgda
+        );
+        validation->load(cmdline.getValue(param_val_file));
+        if (cmdline.getValue(param_verbosity, 0) > 0) { validation->debug(); }
+      }
+    }
+
+    DVector<RelationData*> relation;
+    // (1.2) Load relational data
+    {
+      vector<std::string> rel = cmdline.getStrValues(param_relation);
+
+      std::cout << "#relations: " << rel.size() << std::endl;
+      relation.setSize(rel.size());
+      train.relation.setSize(rel.size());
+      test.relation.setSize(rel.size());
+      for (uint i = 0; i < rel.size(); i++) {
+        relation(i) = new RelationData(
+          cmdline.getValue(param_cache_size, 0),
+          ! (!cmdline.getValue(param_method).compare("mcmc")), // no original data for mcmc
+          ! (!cmdline.getValue(param_method).compare("sgd") || !cmdline.getValue(param_method).compare("sgda")) // no transpose data for sgd, sgda
+        );
+        relation(i)->load(rel[i]);
+        train.relation(i).data = relation(i); // train and test share the same relation table
+        test.relation(i).data = relation(i);
+        train.relation(i).load(rel[i] + ".train", train.num_cases);
+        test.relation(i).load(rel[i] + ".test", test.num_cases);
+      }
+    }
+
+    // (1.3) Load meta data
+    std::cout << "Loading meta data...\t" << std::endl;
+
+    // (main table)
+    uint num_all_attribute = std::max(train.num_feature, test.num_feature);
+    if (validation != NULL) {
+      num_all_attribute = std::max(num_all_attribute, (uint) validation->num_feature);
+    }
+    DataMetaInfo meta_main(num_all_attribute);
+    if (cmdline.hasParameter(param_meta_file)) {
+      meta_main.loadGroupsFromFile(cmdline.getValue(param_meta_file));
+    }
+
+    // build the joined meta table
+    for (uint r = 0; r < train.relation.dim; r++) {
+      train.relation(r).data->attr_offset = num_all_attribute; // attributes of relation r are appended after all previous ones
+      num_all_attribute += train.relation(r).data->num_feature;
+    }
+    DataMetaInfo meta(num_all_attribute);
+    {
+      meta.num_attr_groups = meta_main.num_attr_groups;
+      for (uint r = 0; r < relation.dim; r++) {
+        meta.num_attr_groups += relation(r)->meta->num_attr_groups;
+      }
+      meta.num_attr_per_group.setSize(meta.num_attr_groups);
+      meta.num_attr_per_group.init(0);
+      for (uint i = 0; i < meta_main.attr_group.dim; i++) {
+        meta.attr_group(i) = meta_main.attr_group(i);
+        meta.num_attr_per_group(meta.attr_group(i))++;
+      }
+
+      uint attr_cntr = meta_main.attr_group.dim;
+      uint attr_group_cntr = meta_main.num_attr_groups;
+      for (uint r = 0; r < relation.dim; r++) {
+        for (uint i = 0; i < relation(r)->meta->attr_group.dim; i++) {
+          meta.attr_group(i+attr_cntr) = attr_group_cntr + relation(r)->meta->attr_group(i);
+          meta.num_attr_per_group(attr_group_cntr + relation(r)->meta->attr_group(i))++;
+        }
+        attr_cntr += relation(r)->meta->attr_group.dim;
+        attr_group_cntr += relation(r)->meta->num_attr_groups;
+      }
+      if (cmdline.getValue(param_verbosity, 0) > 0) { meta.debug(); }
+    }
+    meta.num_relations = train.relation.dim;
+
+    // (2) Setup the factorization machine
+    fm_model fm;
+    {
+      fm.num_attribute = num_all_attribute;
+      fm.init_stdev = cmdline.getValue(param_init_stdev, 0.1);
+      // set the number of dimensions in the factorization
+      {
+        vector<int> dim = cmdline.getIntValues(param_dim);
+        if (dim.size() != 3) { throw "-dim must have exactly 3 values: k0,k1,k2"; }
+        fm.k0 = dim[0] != 0;
+        fm.k1 = dim[1] != 0;
+        fm.num_factor = dim[2];
+      }
+      fm.init();
+
+    }
+
+    // (2.1) load the FM model
+    if (cmdline.hasParameter(param_load_model)) {
+      std::cout << "Reading FM model... \t" << std::endl;
+      if(!fm.loadModel(cmdline.getValue(param_load_model))){
+        std::cout << "WARNING: malformed model file. Nothing will be loaded." << std::endl;
+        fm.init();
+      }
+    }
+
+    // (3) Setup the learning method:
+    fm_learn* fml;
+    if (! cmdline.getValue(param_method).compare("sgd")) {
+      fml = new fm_learn_sgd_element();
+      ((fm_learn_sgd*)fml)->num_iter = cmdline.getValue(param_num_iter, 100);
+    } else if (! cmdline.getValue(param_method).compare("sgda")) {
+      if (validation == NULL) { throw "SGDA requires validation data (-validation)"; }
+      fml = new fm_learn_sgd_element_adapt_reg();
+      ((fm_learn_sgd*)fml)->num_iter = cmdline.getValue(param_num_iter, 100);
+      ((fm_learn_sgd_element_adapt_reg*)fml)->validation = validation;
+    } else if (! cmdline.getValue(param_method).compare("mcmc")) {
+      fm.w.init_normal(fm.init_mean, fm.init_stdev);
+      fml = new fm_learn_mcmc_simultaneous();
+      fml->validation = validation;
+      ((fm_learn_mcmc*)fml)->num_iter = cmdline.getValue(param_num_iter, 100);
+      ((fm_learn_mcmc*)fml)->num_eval_cases = cmdline.getValue(param_num_eval_cases, test.num_cases);
+
+      ((fm_learn_mcmc*)fml)->do_sample = cmdline.getValue(param_do_sampling, true);
+      ((fm_learn_mcmc*)fml)->do_multilevel = cmdline.getValue(param_do_multilevel, true);
+    } else {
+      throw "unknown method";
+    }
+    fml->fm = &fm;
+    fml->max_target = train.max_target;
+    fml->min_target = train.min_target;
+    fml->meta = &meta;
+    if (! cmdline.getValue("task").compare("r") ) {
+      fml->task = 0;
+    } else if (! cmdline.getValue("task").compare("c") ) {
+      fml->task = 1;
+      // classification: map every target to -1 (value <= 0) or +1 (value > 0)
+      for (uint i = 0; i < train.target.dim; i++) { if (train.target(i) <= 0.0) { train.target(i) = -1.0; } else {train.target(i) = 1.0; } }
+      for (uint i = 0; i < test.target.dim; i++) { if (test.target(i) <= 0.0) { test.target(i) = -1.0; } else {test.target(i) = 1.0; } }
+      if (validation != NULL) {
+        for (uint i = 0; i < validation->target.dim; i++) { if (validation->target(i) <= 0.0) { validation->target(i) = -1.0; } else {validation->target(i) = 1.0; } }
+      }
+    } else {
+      throw "unknown task";
+    }
+
+    // (4) init the logging
+    RLog* rlog = NULL;
+    if (cmdline.hasParameter(param_r_log)) {
+      ofstream* out_rlog = NULL;
+      std::string r_log_str = cmdline.getValue(param_r_log);
+      out_rlog = new ofstream(r_log_str.c_str());
+      if (! out_rlog->is_open())  {
+        throw "Unable to open file " + r_log_str;
+      }
+      std::cout << "logging to " << r_log_str.c_str() << std::endl;
+      rlog = new RLog(out_rlog);
+    }
+
+    fml->log = rlog;
+    fml->init();
+    if (! cmdline.getValue(param_method).compare("mcmc")) {
+      // set the regularization; for als and mcmc this can be individual per group
+      {
+        vector<double> reg = cmdline.getDblValues(param_regular);
+        if ((reg.size() != 0) && (reg.size() != 1) && (reg.size() != 3) && (reg.size() != (1+meta.num_attr_groups*2))) { throw "-regular must have 0, 1, 3 or 1+2*num_attr_groups values"; }
+        if (reg.size() == 0) {
+          fm.reg0 = 0.0;
+          fm.regw = 0.0;
+          fm.regv = 0.0;
+          ((fm_learn_mcmc*)fml)->w_lambda.init(fm.regw);
+          ((fm_learn_mcmc*)fml)->v_lambda.init(fm.regv);
+        } else if (reg.size() == 1) {
+          fm.reg0 = reg[0];
+          fm.regw = reg[0];
+          fm.regv = reg[0];
+          ((fm_learn_mcmc*)fml)->w_lambda.init(fm.regw);
+          ((fm_learn_mcmc*)fml)->v_lambda.init(fm.regv);
+        } else if (reg.size() == 3) {
+          fm.reg0 = reg[0];
+          fm.regw = reg[1];
+          fm.regv = reg[2];
+          ((fm_learn_mcmc*)fml)->w_lambda.init(fm.regw);
+          ((fm_learn_mcmc*)fml)->v_lambda.init(fm.regv);
+        } else {
+          // per-group layout: reg[0] = bias, then one w value per group, then one v value per group
+          fm.reg0 = reg[0];
+          fm.regw = 0.0;
+          fm.regv = 0.0;
+          int j = 1;
+          for (uint g = 0; g < meta.num_attr_groups; g++) {
+            ((fm_learn_mcmc*)fml)->w_lambda(g) = reg[j];
+            j++;
+          }
+          for (uint g = 0; g < meta.num_attr_groups; g++) {
+            for (int f = 0; f < fm.num_factor; f++) {
+              ((fm_learn_mcmc*)fml)->v_lambda(g,f) = reg[j];
+            }
+            j++;
+          }
+        }
+      }
+    } else {
+      // set the regularization; for standard SGD, groups are not supported
+      {
+        vector<double> reg = cmdline.getDblValues(param_regular);
+        if ((reg.size() != 0) && (reg.size() != 1) && (reg.size() != 3)) { throw "-regular must have 0, 1 or 3 values for SGD/SGDA"; }
+        if (reg.size() == 0) {
+          fm.reg0 = 0.0;
+          fm.regw = 0.0;
+          fm.regv = 0.0;
+        } else if (reg.size() == 1) {
+          fm.reg0 = reg[0];
+          fm.regw = reg[0];
+          fm.regv = reg[0];
+        } else {
+          fm.reg0 = reg[0];
+          fm.regw = reg[1];
+          fm.regv = reg[2];
+        }
+      }
+    }
+    {
+      fm_learn_sgd* fmlsgd= dynamic_cast<fm_learn_sgd*>(fml);
+      if (fmlsgd) {
+        // set the learning rates (individual per layer)
+        {
+          vector<double> lr = cmdline.getDblValues(param_learn_rate);
+          if ((lr.size() != 1) && (lr.size() != 3)) { throw "-learn_rate must have 1 or 3 values"; }
+          if (lr.size() == 1) {
+            fmlsgd->learn_rate = lr[0];
+            fmlsgd->learn_rates.init(lr[0]);
+          } else {
+            fmlsgd->learn_rate = 0;
+            fmlsgd->learn_rates(0) = lr[0];
+            fmlsgd->learn_rates(1) = lr[1];
+            fmlsgd->learn_rates(2) = lr[2];
+          }
+        }
+      }
+    }
+    if (rlog != NULL) {
+      rlog->init();
+    }
+
+    if (cmdline.getValue(param_verbosity, 0) > 0) {
+      fm.debug();
+      fml->debug();
+    }
+
+    // () learn
+    fml->learn(train, test);
+
+    // () Prediction at the end  (not for mcmc and als)
+    if (cmdline.getValue(param_method).compare("mcmc")) {
+      std::cout << "Final\t" << "Train=" << fml->evaluate(train) << "\tTest=" << fml->evaluate(test) << std::endl;
+    }
+
+    // () Save prediction
+    if (cmdline.hasParameter(param_out)) {
+      DVector<double> pred;
+      pred.setSize(test.num_cases);
+      fml->predict(test, pred);
+      pred.save(cmdline.getValue(param_out));
+    }
+
+    // () save the FM model
+    if (cmdline.hasParameter(param_save_model)) {
+      std::cout << "Writing FM model to "<< cmdline.getValue(param_save_model) << std::endl;
+      fm.saveModel(cmdline.getValue(param_save_model));
+    }
+
+  } catch (const std::string &e) {
+    std::cerr << std::endl << "ERROR: " << e << std::endl;
+    return 1;
+  } catch (const char *e) {
+    std::cerr << std::endl << "ERROR: " << e << std::endl;
+    return 1;
+  }
+}
diff --git a/cornac/models/fm/libfm/libfm/tools/convert.cpp b/cornac/models/fm/libfm/libfm/tools/convert.cpp
new file mode 100644
index 000000000..2101c474b
--- /dev/null
+++ b/cornac/models/fm/libfm/libfm/tools/convert.cpp
@@ -0,0 +1,205 @@
+// Copyright (C) 2011, 2012, 2013, 2014 Steffen Rendle
+// Contact:   srendle@libfm.org, http://www.libfm.org/
+//
+// This file is part of libFM.
+//
+// libFM is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// libFM is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with libFM.  If not, see <http://www.gnu.org/licenses/>.
+//
+//
+// convert: Convert a libfm-format file into a binary sparse matrix for x and
+// a dense vector for the target y.
+
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+#include <iomanip>
+#include "../../util/util.h"
+#include "../../util/cmdline.h"
+#include "../src/Data.h"
+
+/**
+ *
+ * Version history:
+ * 1.4.2:
+ *  changed license to GPLv3
+ * 1.4.0:
+ *  no differences, version numbers are kept in sync over all libfm tools
+ * 1.3.6:
+ *  binary mode for file access
+ * 1.3.4:
+ *  no differences, version numbers are kept in sync over all libfm tools
+ * 1.3.2:
+ *  reading without token reader class
+ * 1.0:
+ *  first version
+ */
+
+
+
+using namespace std;
+
+int main(int argc, char **argv) {
+  // Convert tool: read a libfm text file twice (count, then write) and emit binary x and y files. Returns 1 on a reported error.
+  srand ( time(NULL) );
+  try {
+    CMDLine cmdline(argc, argv);
+    std::cout << "----------------------------------------------------------------------------" << std::endl;
+    std::cout << "Convert" << std::endl;
+    std::cout << "  Version: 1.4.2" << std::endl;
+    std::cout << "  Author:  Steffen Rendle, srendle@libfm.org" << std::endl;
+    std::cout << "  WWW:     http://www.libfm.org/" << std::endl;
+    std::cout << "This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt." << std::endl;
+    std::cout << "This is free software, and you are welcome to redistribute it under certain"  << std::endl;
+    std::cout << "conditions; for details see license.txt." << std::endl;
+    std::cout << "----------------------------------------------------------------------------" << std::endl;
+
+    const std::string param_ifile   = cmdline.registerParameter("ifile", "input file name, file has to be in libfm text format [MANDATORY]");
+    const std::string param_ofilex  = cmdline.registerParameter("ofilex", "output file name for x [MANDATORY]");
+    const std::string param_ofiley  = cmdline.registerParameter("ofiley", "output file name for y [MANDATORY]");
+    const std::string param_help    = cmdline.registerParameter("help", "this screen");
+
+
+    if (cmdline.hasParameter(param_help) || (argc == 1)) {
+      cmdline.print_help();
+      return 0;
+    }
+    cmdline.checkParameters();
+
+    std::string ifile = cmdline.getValue(param_ifile);
+    std::string ofilex = cmdline.getValue(param_ofilex);
+    std::string ofiley = cmdline.getValue(param_ofiley);
+
+    uint num_rows = 0;
+    uint64 num_values = 0;
+    uint num_feature = 0;
+    bool has_feature = false;
+    DATA_FLOAT min_target = +std::numeric_limits<DATA_FLOAT>::max();
+    DATA_FLOAT max_target = -std::numeric_limits<DATA_FLOAT>::max();
+
+    // (1) determine the number of rows and the maximum feature_id
+    {
+      std::ifstream fData(ifile.c_str());
+      if (! fData.is_open()) {
+        throw "unable to open " + ifile;
+      }
+      DATA_FLOAT _value; // NOTE(review): the "%f" conversions below assume DATA_FLOAT is float - confirm
+      int nchar;
+      uint _feature;
+      while (!fData.eof()) {
+        std::string line;
+        std::getline(fData, line);
+        const char *pline = line.c_str();
+        while ((*pline == ' ')  || (*pline == 9)) { pline++; } // skip leading spaces
+        if ((*pline == 0)  || (*pline == '#')) { continue; }  // skip empty rows
+        if (sscanf(pline, "%f%n", &_value, &nchar) >=1) {
+          pline += nchar;
+          min_target = std::min(_value, min_target);
+          max_target = std::max(_value, max_target);
+          num_rows++;
+          while (sscanf(pline, "%u:%f%n", &_feature, &_value, &nchar) >= 2) {
+            pline += nchar;
+            num_feature = std::max(_feature, num_feature);
+            has_feature = true;
+            num_values++;
+          }
+          while ((*pline != 0) && ((*pline == ' ')  || (*pline == 9))) { pline++; } // skip trailing spaces
+          if ((*pline != 0)  && (*pline != '#')) {
+            throw "cannot parse line \"" + line + "\" at character " + pline[0];
+          }
+        } else {
+          throw "cannot parse line \"" + line + "\" at character " + pline[0];
+        }
+      }
+      fData.close();
+    }
+    if (has_feature) {
+      num_feature++; // number of feature is bigger (by one) than the largest value
+    }
+    std::cout << "num_rows=" << num_rows << "\tnum_values=" << num_values << "\tnum_features=" << num_feature << "\tmin_target=" << min_target << "\tmax_target=" << max_target << std::endl;
+
+    sparse_row<DATA_FLOAT> row;
+    row.data = new sparse_entry<DATA_FLOAT>[num_feature]; // scratch buffer reused for every row in pass (2)
+
+    // (2) read the data and write it back simultaneously
+    {
+      std::ifstream fData(ifile.c_str());
+      if (! fData.is_open()) {
+        throw "unable to open " + ifile;
+      }
+      std::ofstream out_x(ofilex.c_str(), ios_base::out | ios_base::binary);
+      if (! out_x.is_open()) {
+        throw "unable to open " + ofilex;
+      } else {
+        file_header fh;
+        fh.id = FMATRIX_EXPECTED_FILE_ID;
+        fh.num_values = num_values;
+        fh.num_rows = num_rows;
+        fh.num_cols = num_feature;
+        fh.float_size = sizeof(DATA_FLOAT);
+        out_x.write(reinterpret_cast<char*>(&fh), sizeof(fh));
+      }
+      std::ofstream out_y(ofiley.c_str(), ios_base::out | ios_base::binary);
+      if (! out_y.is_open()) {
+        throw "unable to open " + ofiley;
+      } else {
+        // y header: file version, size of one value, number of rows
+        uint file_version = 1;
+        uint data_size = sizeof(DATA_FLOAT);
+        out_y.write(reinterpret_cast<char*>(&file_version), sizeof(file_version));
+        out_y.write(reinterpret_cast<char*>(&data_size), sizeof(data_size));
+        out_y.write(reinterpret_cast<char*>(&num_rows), sizeof(num_rows));
+      }
+
+      DATA_FLOAT _value;
+      int nchar;
+      uint _feature;
+      while (!fData.eof()) {
+        std::string line;
+        std::getline(fData, line);
+        const char *pline = line.c_str();
+        while ((*pline == ' ')  || (*pline == 9)) { pline++; } // skip leading spaces
+        if ((*pline == 0)  || (*pline == '#')) { continue; }  // skip empty rows
+        if (sscanf(pline, "%f%n", &_value, &nchar) >=1) {
+          pline += nchar;
+          out_y.write(reinterpret_cast<char*>(&(_value)), sizeof(DATA_FLOAT));
+          row.size = 0;
+          while (sscanf(pline, "%u:%f%n", &_feature, &_value, &nchar) >= 2) {
+            pline += nchar;
+            if (row.size >= num_feature) { throw "too many features in line \"" + line + "\""; } // would overflow row.data
+            row.data[row.size].id = _feature;
+            row.data[row.size].value = _value;
+            row.size++;
+          }
+          // each x row is written as its entry count followed by the entries
+          out_x.write(reinterpret_cast<char*>(&(row.size)), sizeof(uint));
+          out_x.write(reinterpret_cast<char*>(row.data), sizeof(sparse_entry<DATA_FLOAT>)*row.size);
+          while ((*pline != 0) && ((*pline == ' ')  || (*pline == 9))) { pline++; } // skip trailing spaces
+          if ((*pline != 0)  && (*pline != '#')) {
+            throw "cannot parse line \"" + line + "\" at character " + pline[0];
+          }
+        } else {
+          throw "cannot parse line \"" + line + "\" at character " + pline[0];
+        }
+      }
+      fData.close();
+      out_x.close();
+      out_y.close();
+
+    }
+  } catch (const std::string &e) {
+    std::cerr << e << std::endl;
+    return 1;
+  }
+}
diff --git a/cornac/models/fm/libfm/libfm/tools/transpose.cpp b/cornac/models/fm/libfm/libfm/tools/transpose.cpp
new file mode 100644
index 000000000..0ac85c37d
--- /dev/null
+++ b/cornac/models/fm/libfm/libfm/tools/transpose.cpp
@@ -0,0 +1,170 @@
+// Copyright (C) 2011, 2012, 2013, 2014 Steffen Rendle
+// Contact:   srendle@libfm.org, http://www.libfm.org/
+//
+// This file is part of libFM.
+//
+// libFM is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// libFM is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with libFM.  If not, see <http://www.gnu.org/licenses/>.
+//
+//
+// transpose: Transposes a matrix in binary sparse format.
+
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+#include <iomanip>
+#include "../../util/util.h"
+#include "../../util/cmdline.h"
+#include "../src/Data.h"
+
+/**
+ *
+ * Version history:
+ * 1.4.2:
+ *  changed license to GPLv3
+ * 1.4.0:
+ *  default cache size is 200 MB
+ * 1.3.6:
+ *  binary mode for file access
+ * 1.3.4:
+ *  no differences, version numbers are kept in sync over all libfm tools
+ * 1.3.2:
+ *  no differences, version numbers are kept in sync over all libfm tools
+ * 1.0:
+ *  first version
+ */
+
+
+using namespace std;
+
+int main(int argc, char **argv) {
+  // Command line tool: reads the matrix in file "ifile" (binary sparse format) and writes
+  // its transpose to file "ofile". The input is iterated once to count the entries per
+  // column and then once more for every batch of output rows that is written.
+  // Errors thrown as std::string are printed to stderr by the handler at the bottom.
+  srand ( time(NULL) );
+  try {
+    CMDLine cmdline(argc, argv);
+    std::cout << "----------------------------------------------------------------------------" << std::endl;
+    std::cout << "Transpose" << std::endl;
+    std::cout << "  Version: 1.4.2" << std::endl;
+    std::cout << "  Author:  Steffen Rendle, srendle@libfm.org" << std::endl;
+    std::cout << "  WWW:     http://www.libfm.org/" << std::endl;
+    std::cout << "This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt." << std::endl;
+    std::cout << "This is free software, and you are welcome to redistribute it under certain"  << std::endl;
+    std::cout << "conditions; for details see license.txt." << std::endl;
+    std::cout << "----------------------------------------------------------------------------" << std::endl;
+
+    const std::string param_ifile  = cmdline.registerParameter("ifile", "input file name, file has to be in binary sparse format [MANDATORY]");
+    const std::string param_ofile  = cmdline.registerParameter("ofile", "output file name [MANDATORY]");
+    const std::string param_cache_size = cmdline.registerParameter("cache_size", "cache size for data storage, default=200000000");
+    const std::string param_help       = cmdline.registerParameter("help", "this screen");
+
+    if (cmdline.hasParameter(param_help) || (argc == 1)) {
+      cmdline.print_help();
+      return 0;
+    }
+    cmdline.checkParameters();
+
+    // (1) Load the data
+    long long cache_size = cmdline.getValue(param_cache_size, 200000000);
+    cache_size /= 2; // one half is handed to the input matrix, the same amount sizes the output caches below
+    LargeSparseMatrixHD<DATA_FLOAT> d_in(cmdline.getValue(param_ifile), cache_size);
+    std::cout << "num_rows=" << d_in.getNumRows() << "\tnum_values=" << d_in.getNumValues() << "\tnum_features=" << d_in.getNumCols() << std::endl;
+
+    // (2) transpose the data
+    // (2.1) count how many entries per col (=transpose-row) there are:
+    DVector<uint> entries_per_col(d_in.getNumCols());
+    entries_per_col.init(0);
+    for (d_in.begin(); !d_in.end(); d_in.next() ) {
+      sparse_row<DATA_FLOAT>& row = d_in.getRow();
+      for (uint j = 0; j < row.size; j++) {
+        entries_per_col(row.data[j].id)++;
+      }
+    }
+    // (2.2) build and write the transposed matrix, one batch of consecutive columns at a time
+    std::string ofile = cmdline.getValue(param_ofile);
+    std::cout << "output to " << ofile << std::endl; std::cout.flush();
+    std::ofstream out(ofile.c_str(), ios_base::out | ios_base::binary);
+    if (out.is_open()) {
+      file_header fh;
+      fh.id = FMATRIX_EXPECTED_FILE_ID;
+      fh.num_values = d_in.getNumValues();
+      fh.num_rows = d_in.getNumCols(); // rows and columns swap roles in the output header
+      fh.num_cols = d_in.getNumRows();
+      fh.float_size = sizeof(DATA_FLOAT);
+      out.write(reinterpret_cast<char*>(&fh), sizeof(fh));
+
+      DVector< sparse_row<DATA_FLOAT> > out_row_cache;
+      DVector< sparse_entry<DATA_FLOAT> > out_entry_cache;
+      {
+        // determine cache sizes automatically:
+        double avg_entries_per_line = (double) d_in.getNumValues() / d_in.getNumCols();
+        uint num_rows_in_cache = cache_size / (sizeof(sparse_entry<DATA_FLOAT>) * avg_entries_per_line + sizeof(uint));
+        num_rows_in_cache = std::min(num_rows_in_cache, d_in.getNumCols());
+        uint64 num_entries_in_cache = (cache_size - sizeof(uint)*num_rows_in_cache) / sizeof(sparse_entry<DATA_FLOAT>);
+        num_entries_in_cache = std::min(num_entries_in_cache, d_in.getNumValues());
+        std::cout << "num entries in cache=" << num_entries_in_cache << "\tnum rows in cache=" << num_rows_in_cache << std::endl;
+        out_entry_cache.setSize(num_entries_in_cache);
+        out_row_cache.setSize(num_rows_in_cache);
+      }
+
+      uint out_cache_col_position = 0; // the first column id that is in cache
+      uint out_cache_col_num = 0; // how many columns are in the cache
+      // each iteration handles the columns [out_cache_col_position, out_cache_col_position + out_cache_col_num)
+      while (out_cache_col_position < d_in.getNumCols()) {
+        // assign cache sizes; the column-index test comes first so that entries_per_col is never read past its last element
+        {
+          uint entry_cache_pos = 0; // invariant: entry_cache_pos <= out_entry_cache.dim, so the subtraction below cannot wrap
+          // while (there is another column in the data) and (there is a free slot in the row cache) and (the entries of that column still fit into the entry cache) do
+          while (((out_cache_col_position + out_cache_col_num) < d_in.getNumCols()) && (out_cache_col_num < out_row_cache.dim) && (entries_per_col(out_cache_col_position + out_cache_col_num) <= (out_entry_cache.dim - entry_cache_pos))) {
+            out_row_cache(out_cache_col_num).size = 0;
+            out_row_cache(out_cache_col_num).data = &(out_entry_cache.value[entry_cache_pos]);
+            entry_cache_pos += entries_per_col(out_cache_col_position + out_cache_col_num);
+            out_cache_col_num++;
+          }
+        }
+        if (out_cache_col_num == 0) { throw std::string("cache_size is too small: the next column does not fit into the output cache"); } // without progress this loop would never end
+        // fill the cache
+        for (d_in.begin(); !d_in.end(); d_in.next() ) {
+          sparse_row<DATA_FLOAT>& row = d_in.getRow();
+          for (uint j = 0; j < row.size; j++) {
+            if ((row.data[j].id >= out_cache_col_position) && (row.data[j].id < (out_cache_col_position+out_cache_col_num))) {
+              uint cache_row_index = row.data[j].id-out_cache_col_position;
+              out_row_cache(cache_row_index).data[out_row_cache(cache_row_index).size].id = d_in.getRowIndex();
+              out_row_cache(cache_row_index).data[out_row_cache(cache_row_index).size].value = row.data[j].value;
+              out_row_cache(cache_row_index).size++;
+            }
+          }
+        }
+        // write the batch: for every cached column its entry count followed by its entries
+        for (uint i = 0; i < out_cache_col_num; i++) {
+          assert(out_row_cache(i).size == entries_per_col(i + out_cache_col_position));
+          out.write(reinterpret_cast<char*>(&(out_row_cache(i).size)), sizeof(uint));
+          out.write(reinterpret_cast<char*>(out_row_cache(i).data), sizeof(sparse_entry<DATA_FLOAT>)*out_row_cache(i).size);
+        }
+        out_cache_col_position += out_cache_col_num;
+        out_cache_col_num = 0;
+      }
+      out.close();
+    } else {
+      throw "could not open " + ofile;
+    }
+
+  } catch (std::string &e) { // NOTE(review): after an error the process still exits with status 0 - confirm whether callers rely on that
+    std::cerr << e << std::endl;
+  }
+}