From 75498776319c46af3a2cda622302e85d100ed936 Mon Sep 17 00:00:00 2001 From: tqtg Date: Wed, 25 Oct 2023 03:49:51 +0000 Subject: [PATCH] Add back libfm files --- cornac/models/fm/libfm/libfm/libfm.cpp | 441 ++++++++++++++++++ .../models/fm/libfm/libfm/tools/convert.cpp | 205 ++++++++ .../models/fm/libfm/libfm/tools/transpose.cpp | 170 +++++++ 3 files changed, 816 insertions(+) create mode 100644 cornac/models/fm/libfm/libfm/libfm.cpp create mode 100644 cornac/models/fm/libfm/libfm/tools/convert.cpp create mode 100644 cornac/models/fm/libfm/libfm/tools/transpose.cpp diff --git a/cornac/models/fm/libfm/libfm/libfm.cpp b/cornac/models/fm/libfm/libfm/libfm.cpp new file mode 100644 index 000000000..82b795a21 --- /dev/null +++ b/cornac/models/fm/libfm/libfm/libfm.cpp @@ -0,0 +1,441 @@ +// Copyright (C) 2010, 2011, 2012, 2013, 2014 Steffen Rendle +// Contact: srendle@libfm.org, http://www.libfm.org/ +// +// This file is part of libFM. +// +// libFM is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// libFM is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with libFM. If not, see <http://www.gnu.org/licenses/>. +// +// +// libfm.cpp: main file for libFM (Factorization Machines) +// +// Based on the publication(s): +// - Steffen Rendle (2010): Factorization Machines, in Proceedings of the 10th +// IEEE International Conference on Data Mining (ICDM 2010), Sydney, +// Australia.
+// - Steffen Rendle, Zeno Gantner, Christoph Freudenthaler, Lars Schmidt-Thieme +// (2011): Fast Context-aware Recommendations with Factorization Machines, in +// Proceedings of the 34th international ACM SIGIR conference on Research and +// development in information retrieval (SIGIR 2011), Beijing, China. +// - Christoph Freudenthaler, Lars Schmidt-Thieme, Steffen Rendle (2011): +// Bayesian Factorization Machines, in NIPS Workshop on Sparse Representation +// and Low-rank Approximation (NIPS-WS 2011), Spain. +// - Steffen Rendle (2012): Learning Recommender Systems with Adaptive +// Regularization, in Proceedings of the 5th ACM International Conference on +// Web Search and Data Mining (WSDM 2012), Seattle, USA. +// - Steffen Rendle (2012): Factorization Machines with libFM, ACM Transactions +// on Intelligent Systems and Technology (TIST 2012). +// - Steffen Rendle (2013): Scaling Factorization Machines to Relational Data, +// in Proceedings of the 39th international conference on Very Large Data +// Bases (VLDB 2013), Trento, Italy. + +#include +#include +#include +#include +#include +#include +#include +#include "../util/util.h" +#include "../util/cmdline.h" +#include "../fm_core/fm_model.h" +#include "src/Data.h" +#include "src/fm_learn.h" +#include "src/fm_learn_sgd.h" +#include "src/fm_learn_sgd_element.h" +#include "src/fm_learn_sgd_element_adapt_reg.h" +#include "src/fm_learn_mcmc_simultaneous.h" + + +using namespace std; + +int main(int argc, char **argv) { /* Command-line entry point of libFM: registers and parses the options, loads the train and test data (plus optional validation and relation data), sets up the fm_model and the learner selected by -method, calls learn(), and optionally writes the predictions and the model. Errors thrown as std::string or char const* are printed to std::cerr by the catch clauses at the end. */ + + try { + CMDLine cmdline(argc, argv); + std::cout << "----------------------------------------------------------------------------" << std::endl; + std::cout << "libFM" << std::endl; + std::cout << " Version: 1.4.4" << std::endl; + std::cout << " Author: Steffen Rendle, srendle@libfm.org" << std::endl; + std::cout << " WWW: http://www.libfm.org/" << std::endl; + std::cout << "This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt."
<< std::endl; + std::cout << "This is free software, and you are welcome to redistribute it under certain" << std::endl; + std::cout << "conditions; for details see license.txt." << std::endl; + std::cout << "----------------------------------------------------------------------------" << std::endl; /* Each registerParameter call below returns the string that is later handed to hasParameter and getValue. */ + + const std::string param_task = cmdline.registerParameter("task", "r=regression, c=binary classification [MANDATORY]"); + const std::string param_meta_file = cmdline.registerParameter("meta", "filename for meta information about data set"); + const std::string param_train_file = cmdline.registerParameter("train", "filename for training data [MANDATORY]"); + const std::string param_test_file = cmdline.registerParameter("test", "filename for test data [MANDATORY]"); + const std::string param_val_file = cmdline.registerParameter("validation", "filename for validation data (only for SGDA)"); + const std::string param_out = cmdline.registerParameter("out", "filename for output"); + + const std::string param_dim = cmdline.registerParameter("dim", "'k0,k1,k2': k0=use bias, k1=use 1-way interactions, k2=dim of 2-way interactions; default=1,1,8"); + const std::string param_regular = cmdline.registerParameter("regular", "'r0,r1,r2' for SGD and ALS: r0=bias regularization, r1=1-way regularization, r2=2-way regularization"); + const std::string param_init_stdev = cmdline.registerParameter("init_stdev", "stdev for initialization of 2-way factors; default=0.1"); + const std::string param_num_iter = cmdline.registerParameter("iter", "number of iterations; default=100"); + const std::string param_learn_rate = cmdline.registerParameter("learn_rate", "learn_rate for SGD; default=0.1"); + + const std::string param_method = cmdline.registerParameter("method", "learning method (SGD, SGDA, ALS, MCMC); default=MCMC"); + + const std::string param_verbosity = cmdline.registerParameter("verbosity", "how much infos to print; default=0"); + const std::string param_r_log =
cmdline.registerParameter("rlog", "write measurements within iterations to a file; default=''"); + const std::string param_seed = cmdline.registerParameter("seed", "integer value, default=None"); + + const std::string param_help = cmdline.registerParameter("help", "this screen"); + + const std::string param_relation = cmdline.registerParameter("relation", "BS: filenames for the relations, default=''"); + + const std::string param_cache_size = cmdline.registerParameter("cache_size", "cache size for data storage (only applicable if data is in binary format), default=infty"); + + const std::string param_save_model = cmdline.registerParameter("save_model", "filename for writing the FM model"); + const std::string param_load_model = cmdline.registerParameter("load_model", "filename for reading the FM model"); + + const std::string param_do_sampling = "do_sampling"; + const std::string param_do_multilevel = "do_multilevel"; + const std::string param_num_eval_cases = "num_eval_cases"; /* Unlike the options above, these three keys are plain literals and are not passed to registerParameter. */ + + if (cmdline.hasParameter(param_help) || (argc == 1)) { + cmdline.print_help(); + return 0; + } + cmdline.checkParameters(); + + // Seed + long int seed = cmdline.getValue(param_seed, time(NULL)); + srand ( seed ); + + if (! cmdline.hasParameter(param_method)) { cmdline.setValue(param_method, "mcmc"); } + if (! cmdline.hasParameter(param_init_stdev)) { cmdline.setValue(param_init_stdev, "0.1"); } + if (! cmdline.hasParameter(param_dim)) { cmdline.setValue(param_dim, "1,1,8"); } + + // Check for invalid flags. + if (! cmdline.getValue(param_method).compare("mcmc") && cmdline.hasParameter(param_save_model)) { /* Throughout this function a compare() result of 0 is treated as a match, so the negated call reads as: method equals mcmc. This check runs before als is rewritten to mcmc further down, so als is not affected by it. */ + std::cout << "WARNING: -save_model enabled only for SGD and ALS."
<< std::endl; + cmdline.removeParameter(param_save_model); + return 0; /* NOTE(review): the message is labelled WARNING and the parameter is removed, yet the program exits here with status 0 without training; confirm whether continuing was intended. The -load_model check below behaves the same way. */ + } + + if (! cmdline.getValue(param_method).compare("mcmc") && cmdline.hasParameter(param_load_model)) { + std::cout << "WARNING: -load_model enabled only for SGD and ALS." << std::endl; + cmdline.removeParameter(param_load_model); + return 0; + } + + if (! cmdline.getValue(param_method).compare("als")) { // als is an mcmc without sampling and hyperparameter inference + cmdline.setValue(param_method, "mcmc"); /* From here on als is handled by the mcmc branches; only the two flags set below differ. */ + if (! cmdline.hasParameter(param_do_sampling)) { cmdline.setValue(param_do_sampling, "0"); } + if (! cmdline.hasParameter(param_do_multilevel)) { cmdline.setValue(param_do_multilevel, "0"); } + } + + // (1) Load the data + std::cout << "Loading train...\t" << std::endl; + Data train( + cmdline.getValue(param_cache_size, 0), + ! (!cmdline.getValue(param_method).compare("mcmc")), // no original data for mcmc + ! (!cmdline.getValue(param_method).compare("sgd") || !cmdline.getValue(param_method).compare("sgda")) // no transpose data for sgd, sgda + ); + train.load(cmdline.getValue(param_train_file)); + if (cmdline.getValue(param_verbosity, 0) > 0) { train.debug(); } + + std::cout << "Loading test... \t" << std::endl; + Data test( + cmdline.getValue(param_cache_size, 0), + ! (!cmdline.getValue(param_method).compare("mcmc")), // no original data for mcmc + ! (!cmdline.getValue(param_method).compare("sgd") || !cmdline.getValue(param_method).compare("sgda")) // no transpose data for sgd, sgda + ); + test.load(cmdline.getValue(param_test_file)); + if (cmdline.getValue(param_verbosity, 0) > 0) { test.debug(); } + + Data* validation = NULL; + if (cmdline.hasParameter(param_val_file)) { + if (cmdline.getValue(param_method).compare("sgda")) { + std::cout << "WARNING: Validation data is only used for SGDA. The data is ignored." << std::endl; + } else { + std::cout << "Loading validation set...\t" << std::endl; + validation = new Data( + cmdline.getValue(param_cache_size, 0), + ! (!cmdline.getValue(param_method).compare("mcmc")), // no original data for mcmc + !
(!cmdline.getValue(param_method).compare("sgd") || !cmdline.getValue(param_method).compare("sgda")) // no transpose data for sgd, sgda + ); + validation->load(cmdline.getValue(param_val_file)); + if (cmdline.getValue(param_verbosity, 0) > 0) { validation->debug(); } + } + } + + DVector relation; + // (1.2) Load relational data + { + vector rel = cmdline.getStrValues(param_relation); + + std::cout << "#relations: " << rel.size() << std::endl; + relation.setSize(rel.size()); + train.relation.setSize(rel.size()); + test.relation.setSize(rel.size()); + for (uint i = 0; i < rel.size(); i++) { + relation(i) = new RelationData( + cmdline.getValue(param_cache_size, 0), + ! (!cmdline.getValue(param_method).compare("mcmc")), // no original data for mcmc + ! (!cmdline.getValue(param_method).compare("sgd") || !cmdline.getValue(param_method).compare("sgda")) // no transpose data for sgd, sgda + ); + relation(i)->load(rel[i]); /* NOTE(review): the objects created with new in this function (validation above, each relation(i) here, fml, out_rlog and rlog below) are not deleted anywhere in the visible code. */ + train.relation(i).data = relation(i); + test.relation(i).data = relation(i); + train.relation(i).load(rel[i] + ".train", train.num_cases); + test.relation(i).load(rel[i] + ".test", test.num_cases); + } + } + + // (1.3) Load meta data + std::cout << "Loading meta data...\t" << std::endl; + + // (main table) + uint num_all_attribute = std::max(train.num_feature, test.num_feature); + if (validation != NULL) { + num_all_attribute = std::max(num_all_attribute, (uint) validation->num_feature); + } + DataMetaInfo meta_main(num_all_attribute); + if (cmdline.hasParameter(param_meta_file)) { + meta_main.loadGroupsFromFile(cmdline.getValue(param_meta_file)); + } + + // build the joined meta table + for (uint r = 0; r < train.relation.dim; r++) { + train.relation(r).data->attr_offset = num_all_attribute; + num_all_attribute += train.relation(r).data->num_feature; + } + DataMetaInfo meta(num_all_attribute); + { + meta.num_attr_groups = meta_main.num_attr_groups; + for (uint r = 0; r < relation.dim; r++) { + meta.num_attr_groups += relation(r)->meta->num_attr_groups;
+ } + meta.num_attr_per_group.setSize(meta.num_attr_groups); + meta.num_attr_per_group.init(0); + for (uint i = 0; i < meta_main.attr_group.dim; i++) { + meta.attr_group(i) = meta_main.attr_group(i); + meta.num_attr_per_group(meta.attr_group(i))++; + } + + uint attr_cntr = meta_main.attr_group.dim; + uint attr_group_cntr = meta_main.num_attr_groups; + for (uint r = 0; r < relation.dim; r++) { + for (uint i = 0; i < relation(r)->meta->attr_group.dim; i++) { + meta.attr_group(i+attr_cntr) = attr_group_cntr + relation(r)->meta->attr_group(i); + meta.num_attr_per_group(attr_group_cntr + relation(r)->meta->attr_group(i))++; + } + attr_cntr += relation(r)->meta->attr_group.dim; + attr_group_cntr += relation(r)->meta->num_attr_groups; + } + if (cmdline.getValue(param_verbosity, 0) > 0) { meta.debug(); } + } + meta.num_relations = train.relation.dim; + + // (2) Setup the factorization machine + fm_model fm; + { + fm.num_attribute = num_all_attribute; + fm.init_stdev = cmdline.getValue(param_init_stdev, 0.1); + // set the number of dimensions in the factorization + { + vector dim = cmdline.getIntValues(param_dim); + assert(dim.size() == 3); /* NOTE(review): the user-supplied -dim value is validated only by assert, which is compiled out when NDEBUG is defined; dim[0..2] are then read unchecked. */ + fm.k0 = dim[0] != 0; + fm.k1 = dim[1] != 0; + fm.num_factor = dim[2]; + } + fm.init(); + + } + + // (2.1) load the FM model + if (cmdline.hasParameter(param_load_model)) { + std::cout << "Reading FM model... \t" << std::endl; + if(!fm.loadModel(cmdline.getValue(param_load_model))){ + std::cout << "WARNING: malformed model file. Nothing will be loaded." << std::endl; + fm.init(); + } + } + + // (3) Setup the learning method: + fm_learn* fml; + if (! cmdline.getValue(param_method).compare("sgd")) { + fml = new fm_learn_sgd_element(); + ((fm_learn_sgd*)fml)->num_iter = cmdline.getValue(param_num_iter, 100); + + } else if (!
cmdline.getValue(param_method).compare("sgda")) { + assert(validation != NULL); /* NOTE(review): validation is only non-NULL when -validation was given, so for sgda a missing validation file is caught by this assert alone. */ + fml = new fm_learn_sgd_element_adapt_reg(); + ((fm_learn_sgd*)fml)->num_iter = cmdline.getValue(param_num_iter, 100); + ((fm_learn_sgd_element_adapt_reg*)fml)->validation = validation; + + } else if (! cmdline.getValue(param_method).compare("mcmc")) { + fm.w.init_normal(fm.init_mean, fm.init_stdev); + fml = new fm_learn_mcmc_simultaneous(); + fml->validation = validation; + ((fm_learn_mcmc*)fml)->num_iter = cmdline.getValue(param_num_iter, 100); + ((fm_learn_mcmc*)fml)->num_eval_cases = cmdline.getValue(param_num_eval_cases, test.num_cases); + + ((fm_learn_mcmc*)fml)->do_sample = cmdline.getValue(param_do_sampling, true); + ((fm_learn_mcmc*)fml)->do_multilevel = cmdline.getValue(param_do_multilevel, true); + } else { + throw "unknown method"; /* A string literal is thrown here, so this is handled by the char const* catch clause at the end of main. */ + } + fml->fm = &fm; + fml->max_target = train.max_target; + fml->min_target = train.min_target; + fml->meta = &meta; + if (! cmdline.getValue("task").compare("r") ) { + fml->task = 0; + } else if (! cmdline.getValue("task").compare("c") ) { + fml->task = 1; + for (uint i = 0; i < train.target.dim; i++) { if (train.target(i) <= 0.0) { train.target(i) = -1.0; } else {train.target(i) = 1.0; } } + for (uint i = 0; i < test.target.dim; i++) { if (test.target(i) <= 0.0) { test.target(i) = -1.0; } else {test.target(i) = 1.0; } } + if (validation != NULL) { + for (uint i = 0; i < validation->target.dim; i++) { if (validation->target(i) <= 0.0) { validation->target(i) = -1.0; } else {validation->target(i) = 1.0; } } + } + } else { + throw "unknown task"; + } + + // (4) init the logging + RLog* rlog = NULL; + if (cmdline.hasParameter(param_r_log)) { + ofstream* out_rlog = NULL; + std::string r_log_str = cmdline.getValue(param_r_log); + out_rlog = new ofstream(r_log_str.c_str()); + if (!
out_rlog->is_open()) { + throw "Unable to open file " + r_log_str; /* Literal plus std::string yields a std::string, so this one is handled by the std::string catch clause. */ + } + std::cout << "logging to " << r_log_str.c_str() << std::endl; + rlog = new RLog(out_rlog); + } + + fml->log = rlog; + fml->init(); + if (! cmdline.getValue(param_method).compare("mcmc")) { + // set the regularization; for als and mcmc this can be individual per group + { + vector reg = cmdline.getDblValues(param_regular); + assert((reg.size() == 0) || (reg.size() == 1) || (reg.size() == 3) || (reg.size() == (1+meta.num_attr_groups*2))); + if (reg.size() == 0) { + fm.reg0 = 0.0; + fm.regw = 0.0; + fm.regv = 0.0; + ((fm_learn_mcmc*)fml)->w_lambda.init(fm.regw); + ((fm_learn_mcmc*)fml)->v_lambda.init(fm.regv); + } else if (reg.size() == 1) { + fm.reg0 = reg[0]; + fm.regw = reg[0]; + fm.regv = reg[0]; + ((fm_learn_mcmc*)fml)->w_lambda.init(fm.regw); + ((fm_learn_mcmc*)fml)->v_lambda.init(fm.regv); + } else if (reg.size() == 3) { + fm.reg0 = reg[0]; + fm.regw = reg[1]; + fm.regv = reg[2]; + ((fm_learn_mcmc*)fml)->w_lambda.init(fm.regw); + ((fm_learn_mcmc*)fml)->v_lambda.init(fm.regv); + } else { + fm.reg0 = reg[0]; + fm.regw = 0.0; + fm.regv = 0.0; + int j = 1; + for (uint g = 0; g < meta.num_attr_groups; g++) { + ((fm_learn_mcmc*)fml)->w_lambda(g) = reg[j]; + j++; + } + for (uint g = 0; g < meta.num_attr_groups; g++) { + for (int f = 0; f < fm.num_factor; f++) { + ((fm_learn_mcmc*)fml)->v_lambda(g,f) = reg[j]; + } + j++; + } + } + } + } else { + // set the regularization; for standard SGD, groups are not supported + { + vector reg = cmdline.getDblValues(param_regular); + assert((reg.size() == 0) || (reg.size() == 1) || (reg.size() == 3)); + if (reg.size() == 0) { + fm.reg0 = 0.0; + fm.regw = 0.0; + fm.regv = 0.0; + } else if (reg.size() == 1) { + fm.reg0 = reg[0]; + fm.regw = reg[0]; + fm.regv = reg[0]; + } else { + fm.reg0 = reg[0]; + fm.regw = reg[1]; + fm.regv = reg[2]; + } + } + } + { + fm_learn_sgd* fmlsgd= dynamic_cast(fml); + if (fmlsgd) { + // set the learning rates (individual per layer)
+ { + vector lr = cmdline.getDblValues(param_learn_rate); + assert((lr.size() == 1) || (lr.size() == 3)); /* NOTE(review): the help text advertises default=0.1 for learn_rate, but this assert requires exactly 1 or 3 values; confirm that getDblValues supplies a value when -learn_rate is omitted. */ + if (lr.size() == 1) { + fmlsgd->learn_rate = lr[0]; + fmlsgd->learn_rates.init(lr[0]); + } else { + fmlsgd->learn_rate = 0; + fmlsgd->learn_rates(0) = lr[0]; + fmlsgd->learn_rates(1) = lr[1]; + fmlsgd->learn_rates(2) = lr[2]; + } + } + } + } + if (rlog != NULL) { + rlog->init(); + } + + if (cmdline.getValue(param_verbosity, 0) > 0) { + fm.debug(); + fml->debug(); + } + + // () learn + fml->learn(train, test); + + // () Prediction at the end (not for mcmc and als) + if (cmdline.getValue(param_method).compare("mcmc")) { + std::cout << "Final\t" << "Train=" << fml->evaluate(train) << "\tTest=" << fml->evaluate(test) << std::endl; + } + + // () Save prediction + if (cmdline.hasParameter(param_out)) { + DVector pred; + pred.setSize(test.num_cases); + fml->predict(test, pred); + pred.save(cmdline.getValue(param_out)); + } + + // () save the FM model + if (cmdline.hasParameter(param_save_model)) { + std::cout << "Writing FM model to "<< cmdline.getValue(param_save_model) << std::endl; + fm.saveModel(cmdline.getValue(param_save_model)); + } + + } catch (std::string &e) { + std::cerr << std::endl << "ERROR: " << e << std::endl; + } catch (char const* &e) { + std::cerr << std::endl << "ERROR: " << e << std::endl; /* NOTE(review): after either handler control falls off the end of main, so the process still exits with status 0 when an error was reported. */ + } +} diff --git a/cornac/models/fm/libfm/libfm/tools/convert.cpp b/cornac/models/fm/libfm/libfm/tools/convert.cpp new file mode 100644 index 000000000..2101c474b --- /dev/null +++ b/cornac/models/fm/libfm/libfm/tools/convert.cpp @@ -0,0 +1,205 @@ +// Copyright (C) 2011, 2012, 2013, 2014 Steffen Rendle +// Contact: srendle@libfm.org, http://www.libfm.org/ +// +// This file is part of libFM. +// +// libFM is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version.
+// +// libFM is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with libFM. If not, see <http://www.gnu.org/licenses/>. +// +// +// convert: Convert a libfm-format file into a binary sparse matrix for x and +// a dense vector for the target y. + +#include +#include +#include +#include +#include +#include +#include +#include "../../util/util.h" +#include "../../util/cmdline.h" +#include "../src/Data.h" + +/** + * + * Version history: + * 1.4.2: + * changed license to GPLv3 + * 1.4.0: + * no differences, version numbers are kept in sync over all libfm tools + * 1.3.6: + * binary mode for file access + * 1.3.4: + * no differences, version numbers are kept in sync over all libfm tools + * 1.3.2: + * reading without token reader class + * 1.0: + * first version + */ + + + +using namespace std; + +int main(int argc, char **argv) { /* Command-line entry point of the convert tool. It reads the text input twice: pass (1) counts rows and values and finds the largest feature id and the target range; pass (2) re-reads the file and writes the targets to -ofiley and the sparse rows to -ofilex, both opened in binary mode. Errors thrown as std::string are printed to std::cerr. */ + + srand ( time(NULL) ); /* NOTE(review): no call to rand() is visible in this function; the seeding may be unnecessary - confirm. */ + try { + CMDLine cmdline(argc, argv); + std::cout << "----------------------------------------------------------------------------" << std::endl; + std::cout << "Convert" << std::endl; + std::cout << " Version: 1.4.2" << std::endl; + std::cout << " Author: Steffen Rendle, srendle@libfm.org" << std::endl; + std::cout << " WWW: http://www.libfm.org/" << std::endl; + std::cout << "This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt." << std::endl; + std::cout << "This is free software, and you are welcome to redistribute it under certain" << std::endl; + std::cout << "conditions; for details see license.txt."
<< std::endl; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + const std::string param_ifile = cmdline.registerParameter("ifile", "input file name, file has to be in binary sparse format [MANDATORY]"); /* NOTE(review): this help text says binary sparse format, but the code below parses the input as text lines with sscanf (a target followed by id:value pairs); the wording looks copied from the transpose tool. */ + const std::string param_ofilex = cmdline.registerParameter("ofilex", "output file name for x [MANDATORY]"); + const std::string param_ofiley = cmdline.registerParameter("ofiley", "output file name for y [MANDATORY]"); + const std::string param_help = cmdline.registerParameter("help", "this screen"); + + + if (cmdline.hasParameter(param_help) || (argc == 1)) { + cmdline.print_help(); + return 0; + } + cmdline.checkParameters(); + + std::string ifile = cmdline.getValue(param_ifile); + std::string ofilex = cmdline.getValue(param_ofilex); + std::string ofiley = cmdline.getValue(param_ofiley); + + uint num_rows = 0; + uint64 num_values = 0; + uint num_feature = 0; + bool has_feature = false; + DATA_FLOAT min_target = +std::numeric_limits::max(); + DATA_FLOAT max_target = -std::numeric_limits::max(); + + // (1) determine the number of rows and the maximum feature_id + { + std::ifstream fData(ifile.c_str()); + if (!
fData.is_open()) { + throw "unable to open " + ifile; + } + DATA_FLOAT _value; + int nchar; + uint _feature; + while (!fData.eof()) { + std::string line; + std::getline(fData, line); + const char *pline = line.c_str(); + while ((*pline == ' ') || (*pline == 9)) { pline++; } // skip leading spaces + if ((*pline == 0) || (*pline == '#')) { continue; } // skip empty rows + if (sscanf(pline, "%f%n", &_value, &nchar) >=1) { /* The leading number on a line is the target; %n records how many characters were consumed so pline can be advanced. */ + pline += nchar; + min_target = std::min(_value, min_target); + max_target = std::max(_value, max_target); + num_rows++; + while (sscanf(pline, "%d:%f%n", &_feature, &_value, &nchar) >= 2) { /* NOTE(review): %d is paired with the uint variable _feature (%u would match its declared type), and %f assumes DATA_FLOAT is float - confirm against the typedefs. The same format is used in pass (2). */ + pline += nchar; + num_feature = std::max(_feature, num_feature); + has_feature = true; + num_values++; + } + while ((*pline != 0) && ((*pline == ' ') || (*pline == 9))) { pline++; } // skip trailing spaces + if ((*pline != 0) && (*pline != '#')) { + throw "cannot parse line \"" + line + "\" at character " + pline[0]; + } + } else { + throw "cannot parse line \"" + line + "\" at character " + pline[0]; + } + } + fData.close(); + } + if (has_feature) { + num_feature++; // number of feature is bigger (by one) than the largest value + } + std::cout << "num_rows=" << num_rows << "\tnum_values=" << num_values << "\tnum_features=" << num_feature << "\tmin_target=" << min_target << "\tmax_target=" << max_target << std::endl; + + sparse_row row; + row.data = new sparse_entry[num_feature]; /* Scratch buffer reused for every row in pass (2). NOTE(review): it is not deleted anywhere in the visible code. */ + + // (2) read the data and write it back simultaneously + { + std::ifstream fData(ifile.c_str()); + if (! fData.is_open()) { + throw "unable to open " + ifile; + } + std::ofstream out_x(ofilex.c_str(), ios_base::out | ios_base::binary); + if (!
out_x.is_open()) { + throw "unable to open " + ofilex; + } else { + file_header fh; + fh.id = FMATRIX_EXPECTED_FILE_ID; + fh.num_values = num_values; + fh.num_rows = num_rows; + fh.num_cols = num_feature; + fh.float_size = sizeof(DATA_FLOAT); + out_x.write(reinterpret_cast(&fh), sizeof(fh)); + } + std::ofstream out_y(ofiley.c_str(), ios_base::out | ios_base::binary); + if (! out_y.is_open()) { + throw "unable to open " + ofiley; + } else { + uint file_version = 1; + uint data_size = sizeof(DATA_FLOAT); + out_y.write(reinterpret_cast(&file_version), sizeof(file_version)); + out_y.write(reinterpret_cast(&data_size), sizeof(data_size)); + out_y.write(reinterpret_cast(&num_rows), sizeof(num_rows)); + } + + DATA_FLOAT _value; + int nchar; + uint _feature; + while (!fData.eof()) { + std::string line; + std::getline(fData, line); + const char *pline = line.c_str(); + while ((*pline == ' ') || (*pline == 9)) { pline++; } // skip leading spaces + if ((*pline == 0) || (*pline == '#')) { continue; } // skip empty rows + if (sscanf(pline, "%f%n", &_value, &nchar) >=1) { + pline += nchar; + out_y.write(reinterpret_cast(&(_value)), sizeof(DATA_FLOAT)); + row.size = 0; + while (sscanf(pline, "%d:%f%n", &_feature, &_value, &nchar) >= 2) { + pline += nchar; + assert(row.size < num_feature); /* NOTE(review): this is the only guard on the row buffer. A line holding num_feature or more id:value pairs (for example with a repeated id) would write past row.data when assert is compiled out. */ + row.data[row.size].id = _feature; + row.data[row.size].value = _value; + row.size++; + } + out_x.write(reinterpret_cast(&(row.size)), sizeof(uint)); + out_x.write(reinterpret_cast(row.data), sizeof(sparse_entry)*row.size); + while ((*pline != 0) && ((*pline == ' ') || (*pline == 9))) { pline++; } // skip trailing spaces + if ((*pline != 0) && (*pline != '#')) { + throw "cannot parse line \"" + line + "\" at character " + pline[0]; + } + } else { + throw "cannot parse line \"" + line + "\" at character " + pline[0]; + } + } + fData.close(); + out_x.close(); + out_y.close(); + + } + } catch (std::string &e) { + std::cerr << e << std::endl; /* NOTE(review): control then falls off the end of main, so the exit status is 0 even after an error was reported. */ + } + +} diff --git
a/cornac/models/fm/libfm/libfm/tools/transpose.cpp b/cornac/models/fm/libfm/libfm/tools/transpose.cpp new file mode 100644 index 000000000..0ac85c37d --- /dev/null +++ b/cornac/models/fm/libfm/libfm/tools/transpose.cpp @@ -0,0 +1,170 @@ +// Copyright (C) 2011, 2012, 2013, 2014 Steffen Rendle +// Contact: srendle@libfm.org, http://www.libfm.org/ +// +// This file is part of libFM. +// +// libFM is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// libFM is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with libFM. If not, see <http://www.gnu.org/licenses/>. +// +// +// transpose: Transposes a matrix in binary sparse format.
+ +#include +#include +#include +#include +#include +#include +#include +#include "../../util/util.h" +#include "../../util/cmdline.h" +#include "../src/Data.h" + +/** + * + * Version history: + * 1.4.2: + * changed license to GPLv3 + * 1.4.0: + * default cache size is 200 MB + * 1.3.6: + * binary mode for file access + * 1.3.4: + * no differences, version numbers are kept in sync over all libfm tools + * 1.3.2: + * no differences, version numbers are kept in sync over all libfm tools + * 1.0: + * first version + */ + + +using namespace std; + +int main(int argc, char **argv) { /* Command-line entry point of the transpose tool. It opens the input matrix through LargeSparseMatrixHD, counts the entries of every column, and then writes the transposed matrix to -ofile in batches of columns that fit the cache; every batch needs one full pass over the input. Errors thrown as std::string are printed to std::cerr. */ + + srand ( time(NULL) ); /* NOTE(review): no call to rand() is visible in this function; the seeding may be unnecessary - confirm. */ + try { + CMDLine cmdline(argc, argv); + std::cout << "----------------------------------------------------------------------------" << std::endl; + std::cout << "Transpose" << std::endl; + std::cout << " Version: 1.4.2" << std::endl; + std::cout << " Author: Steffen Rendle, srendle@libfm.org" << std::endl; + std::cout << " WWW: http://www.libfm.org/" << std::endl; + std::cout << "This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt." << std::endl; + std::cout << "This is free software, and you are welcome to redistribute it under certain" << std::endl; + std::cout << "conditions; for details see license.txt."
<< std::endl; + std::cout << "----------------------------------------------------------------------------" << std::endl; + + const std::string param_ifile = cmdline.registerParameter("ifile", "input file name, file has to be in binary sparse format [MANDATORY]"); + const std::string param_ofile = cmdline.registerParameter("ofile", "output file name [MANDATORY]"); + + const std::string param_cache_size = cmdline.registerParameter("cache_size", "cache size for data storage, default=200000000"); + const std::string param_help = cmdline.registerParameter("help", "this screen"); + + + if (cmdline.hasParameter(param_help) || (argc == 1)) { + cmdline.print_help(); + return 0; + } + cmdline.checkParameters(); + + + // (1) Load the data + long long cache_size = cmdline.getValue(param_cache_size, 200000000); + cache_size /= 2; /* The halved budget is used twice: it is passed to the input matrix on the next line and it sizes the output caches further down. */ + LargeSparseMatrixHD d_in(cmdline.getValue(param_ifile), cache_size); + std::cout << "num_rows=" << d_in.getNumRows() << "\tnum_values=" << d_in.getNumValues() << "\tnum_features=" << d_in.getNumCols() << std::endl; + + // (2) transpose the data + // (2.1) count how many entries per col (=transpose-row) there are: + DVector entries_per_col(d_in.getNumCols()); + entries_per_col.init(0); + for (d_in.begin(); !d_in.end(); d_in.next() ) { + sparse_row& row = d_in.getRow(); + for (uint j = 0; j < row.size; j++) { + entries_per_col(row.data[j].id)++; + } + } + // (2.2) build the transposed matrix batch-wise and write it to the output file + std::string ofile = cmdline.getValue(param_ofile); + std::cout << "output to " << ofile << std::endl; std::cout.flush(); + std::ofstream out(ofile.c_str(), ios_base::out | ios_base::binary); + if (out.is_open()) { + file_header fh; + fh.id = FMATRIX_EXPECTED_FILE_ID; + fh.num_values = d_in.getNumValues(); + fh.num_rows = d_in.getNumCols(); + fh.num_cols = d_in.getNumRows(); + fh.float_size = sizeof(DATA_FLOAT); + out.write(reinterpret_cast(&fh), sizeof(fh)); + + DVector< sparse_row > out_row_cache; + DVector< sparse_entry > out_entry_cache; + { + // determine cache sizes
automatically: + double avg_entries_per_line = (double) d_in.getNumValues() / d_in.getNumCols(); /* NOTE(review): getNumCols() is the divisor; an input with zero columns would make this a division by zero - confirm whether such input can reach this point. */ + uint num_rows_in_cache = cache_size / (sizeof(sparse_entry) * avg_entries_per_line + sizeof(uint)); + num_rows_in_cache = std::min(num_rows_in_cache, d_in.getNumCols()); + uint64 num_entries_in_cache = (cache_size - sizeof(uint)*num_rows_in_cache) / sizeof(sparse_entry); + num_entries_in_cache = std::min(num_entries_in_cache, d_in.getNumValues()); + std::cout << "num entries in cache=" << num_entries_in_cache << "\tnum rows in cache=" << num_rows_in_cache << std::endl; + out_entry_cache.setSize(num_entries_in_cache); + out_row_cache.setSize(num_rows_in_cache); + } + + uint out_cache_col_position = 0; // the first column id that is in cache + uint out_cache_col_num = 0; // how many columns are in the cache + + while (out_cache_col_position < d_in.getNumCols()) { + // assign cache sizes + { + uint entry_cache_pos = 0; + // while (there is enough space in the entry cache for the next row) and (there is space for another row) and (there is another row in the data) do + while (((entry_cache_pos + entries_per_col(out_cache_col_position + out_cache_col_num)) < out_entry_cache.dim) && ((out_cache_col_num+1) < out_row_cache.dim) && ((out_cache_col_position+out_cache_col_num) < d_in.getNumCols())) { /* NOTE(review): entries_per_col is indexed in the first operand, before the third operand tests that the index is below getNumCols(); once the last column has been assigned the index equals getNumCols() - confirm how DVector handles that access. */ + out_row_cache(out_cache_col_num).size = 0; + out_row_cache(out_cache_col_num).data = &(out_entry_cache.value[entry_cache_pos]); + entry_cache_pos += entries_per_col(out_cache_col_position + out_cache_col_num); + out_cache_col_num++; + } + } + assert(out_cache_col_num > 0); /* NOTE(review): if not even one column fits the caches, out_cache_col_num stays 0; with assert compiled out the outer loop would then add 0 to out_cache_col_position on every iteration and not terminate. */ + // fill the cache + for (d_in.begin(); !d_in.end(); d_in.next() ) { + sparse_row& row = d_in.getRow(); + for (uint j = 0; j < row.size; j++) { + if ((row.data[j].id >= out_cache_col_position) && (row.data[j].id < (out_cache_col_position+out_cache_col_num))) { + uint cache_row_index = row.data[j].id-out_cache_col_position; + out_row_cache(cache_row_index).data[out_row_cache(cache_row_index).size].id =
d_in.getRowIndex(); + out_row_cache(cache_row_index).data[out_row_cache(cache_row_index).size].value = row.data[j].value; + out_row_cache(cache_row_index).size++; + } + } + } + + for (uint i = 0; i < out_cache_col_num; i++) { + assert(out_row_cache(i).size == entries_per_col(i + out_cache_col_position)); + out.write(reinterpret_cast(&(out_row_cache(i).size)), sizeof(uint)); + out.write(reinterpret_cast(out_row_cache(i).data), sizeof(sparse_entry)*out_row_cache(i).size); + } + out_cache_col_position += out_cache_col_num; + out_cache_col_num = 0; + } + out.close(); + } else { + throw "could not open " + ofile; + } + + } catch (std::string &e) { + std::cerr << e << std::endl; /* NOTE(review): control then falls off the end of main, so the exit status is 0 even after an error was reported. */ + } +}