@article{xie2020,
title = {Pruned {{Wasserstein Index Generation Model}} and Wigpy {{Package}}},
author = {Xie, Fangzhou},
year = {2020},
month = jul,
journal = {arXiv:2004.00999 [cs, econ, q-fin]},
eprint = {2004.00999},
primaryclass = {cs, econ, q-fin},
url = {http://arxiv.org/abs/2004.00999},
urldate = {2020-07-27},
  abstract = {The recently proposed Wasserstein Index Generation model (WIG) has shown a new direction for automatically generating indices. However, it is challenging in practice to fit large datasets, for two reasons. First, the Sinkhorn distance is notoriously expensive to compute and suffers severely from dimensionality. Second, it requires computing a full $N \times N$ matrix to be fit into memory, where $N$ is the dimension of the vocabulary. When the dimensionality is too large, the computation becomes infeasible. I hereby propose a Lasso-based shrinkage method to reduce the dimensionality of the vocabulary as a pre-processing step prior to fitting the WIG model. After obtaining word embeddings from a Word2Vec model, we cluster these high-dimensional vectors by $k$-means clustering and pick the most frequent tokens within each cluster to form the "base vocabulary". Non-base tokens are then regressed on the vectors of the base tokens to obtain transformation weights, so that the whole vocabulary can be represented by the "base tokens" alone. This variant, called pruned WIG (pWIG), enables us to shrink the vocabulary dimension at will while still achieving high accuracy. I also provide a \textit{wigpy} module in Python to carry out computation in both flavors. An application to the Economic Policy Uncertainty (EPU) index is showcased as a comparison with existing methods of generating time-series sentiment indices.},
archiveprefix = {arxiv},
copyright = {All rights reserved},
keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning,Economics - General Economics}
}
@article{xie2020a,
title = {Wasserstein {{Index Generation Model}}: {{Automatic}} Generation of Time-Series Index with Application to {{Economic Policy Uncertainty}}},
shorttitle = {Wasserstein {{Index Generation Model}}},
author = {Xie, Fangzhou},
year = {2020},
month = jan,
journal = {Economics Letters},
volume = {186},
pages = {108874},
issn = {0165-1765},
doi = {10.1016/j.econlet.2019.108874},
url = {http://www.sciencedirect.com/science/article/pii/S0165176519304410},
urldate = {2019-12-10},
  abstract = {I propose a novel method, the Wasserstein Index Generation model (WIG), to generate a public sentiment index automatically. To test the model's effectiveness, an application to generating the Economic Policy Uncertainty (EPU) index is showcased.},
copyright = {All rights reserved},
langid = {english},
selected = {true},
keywords = {Economic Policy Uncertainty Index (EPU),Singular Value Decomposition (SVD),Wasserstein Dictionary Learning (WDL),Wasserstein Index Generation Model (WIG)},
abbr = {Econ. Lett.},
altmetric = {true},
google_scholar_id = {u-x6o8ySG0sC},
dimensions = {true}
}
@article{xie2022,
title = {Rethnicity: {{An R}} Package for Predicting Ethnicity from Names},
shorttitle = {Rethnicity},
author = {Xie, Fangzhou},
year = {2022},
month = jan,
journal = {SoftwareX},
volume = {17},
pages = {100965},
issn = {2352-7110},
doi = {10.1016/j.softx.2021.100965},
url = {https://www.sciencedirect.com/science/article/pii/S2352711021001874},
urldate = {2022-01-06},
  abstract = {In this study, a new R package, rethnicity (https://github.com/fangzhou-xie/rethnicity, also published on CRAN), is provided for predicting ethnicity based on names. The Bidirectional Long Short-Term Memory (Bi-LSTM), a recurrent neural network architecture commonly used for natural language processing, was chosen as the model for our study. The Florida Voter Registration data were used as the training and testing data. Special care was given to the accuracy for minority groups by adjusting the imbalance in the dataset. The models were trained and exported to C++ and then integrated with R using Rcpp. Additionally, the availability, accuracy, and performance of the package were compared with other solutions.},
copyright = {All rights reserved},
langid = {english},
selected = {true},
keywords = {Ethnicity prediction,LSTM,R},
abbr = {SoftwareX},
altmetric = {true},
google_scholar_id = {2osOgNQ5qMEC},
dimensions = {true}
}