Commit 98ec66ee by Vik Paruchuri

Add needed files back in

parent e3283669
.idea/
__pycache__/
models/
*.pyc
*~
tests/
_build/
build/
dist/
machine_learning.egg-info/
*.egg
ML
====================
Overview
---------------------
This is a repo with functions that can score arbitrary free text and numeric predictors.
It is licensed under the AGPL; please see LICENSE.txt for details.
The goal is to provide a high-performance, scalable solution that can predict targets from arbitrary values.
Note that this is a library: you will need to implement your own code to make it runnable. The ml-service-api repo in
the edX github organization is an API wrapper for this code. See http://github.com/edx/ml-service-api for more information.
How to Contribute
-----------------------
Contributions are very welcome. The easiest way is to fork this repo, and then make a pull request from your fork.
The current backlog is in the issues section. Please feel free to open new issues or work on existing ones.
Detailed Information
-------------------------
Please look in the docs folder for more detailed documentation. There is a README there that explains how to build
and view the docs.
python-pip
gfortran
libblas3gf
libblas-dev
liblapack3gf
liblapack-dev
libatlas-base-dev
libxml2-dev
libxslt1-dev
aspell
python
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info changes linkcheck doctest gettext
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
clean:
-rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/MLAPI.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/MLAPI.qhc"
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/MLAPI"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/MLAPI"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
info:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo "Running Texinfo files through makeinfo..."
make -C $(BUILDDIR)/texinfo info
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
This directory contains documentation that can be built into HTML using sphinx (http://sphinx.pocoo.org/).
Sphinx uses ReST (reStructuredText) as the source for its documentation files.
To create an HTML version of the docs:
* Install Sphinx (``pip install Sphinx``)
* In this directory, type ``make html`` (or ``make.bat html`` on
Windows) at a shell prompt.
The documentation in _build/html/index.html can then be viewed in a web browser.
# -*- coding: utf-8 -*-
#
# ML API documentation build configuration file, created by
# sphinx-quickstart on Fri Mar 1 09:51:10 2013.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys, os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.append(os.path.abspath('.'))
sys.path.append(os.path.abspath('../'))
sys.path.append(os.path.abspath('../machine-learning'))
sys.path.append(os.path.abspath('../../'))
# -- General configuration -----------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.mathjax', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode']
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'ML'
copyright = u'2013, edX'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '.01'
# The full version, including alpha/beta/rc tags.
release = '.01'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# -- Options for HTML output ---------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'MLdoc'
# -- Options for LaTeX output --------------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'ML.tex', u'ML API Documentation',
u'edX', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output --------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'ml', u'ML Documentation',
[u'edX'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output ------------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'ML', u'ML Documentation',
u'edX', 'ML', 'One line description of project.',
'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {'http://docs.python.org/': None}
.. ML documentation master file, created by
sphinx-quickstart on Fri Mar 1 09:51:10 2013.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
ML Documentation
==================================
Overview
---------------------------------
.. toctree::
:maxdepth: 1
overview/description
overview/goals
overview/contributing
Installation and Usage
---------------------------------
.. toctree::
:maxdepth: 1
installation/installation_overview
installation/usage
Module Documentation
---------------------------------
.. toctree::
:maxdepth: 1
project
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
===============================================
Installation Overview
===============================================
Notes on how to install:
1. cd DIRECTORY_YOU_INSTALLED_TO. Make sure that you installed to a folder named machine-learning!
2. sudo apt-get update
3. sudo apt-get upgrade gcc
4. sudo xargs -a apt-packages.txt apt-get install
5. Activate your virtual env (if you have one)
6. pip install -r pre-requirements.txt
7. pip install -r requirements.txt
8. python -m nltk.downloader maxent_treebank_pos_tagger wordnet
9. sudo mv /path/to/nltk_data /usr/share
==================================
Usage
==================================
This repo offers the ability to create models and to grade new text. There are additional lower-level functions as well.
Essay Grading
-------------------------------------
Essay grading is done via the "grade" function in grade.py and the "create" function in create.py. Call the create function with the appropriate data (see the documentation there) to obtain a trained model, then pass that model to the "grade" function to get scores for new text.
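For example, here is a minimal sketch of that workflow. The data is invented and far too small for real use (the description docs suggest roughly 200 scored essays as a baseline), and it assumes the repo root is on your Python path:

.. code-block:: python

    import create
    import grade

    # Hand-scored training essays (toy data).
    essay_texts = ["The sky is blue because light scatters.", "I like turtles.", "Rayleigh scattering makes the sky blue."]
    scores = [2, 0, 2]
    prompt = "Explain why the sky is blue."

    # Train a model. The returned dictionary holds the trained classifier,
    # the trained feature extractor, and cross-validation error estimates.
    # With data this small, training may fail; check model_info['success'].
    model_info = create.create(essay_texts, scores, prompt)

    # Package the pieces that the grade function expects.
    grader_data = {
        'model': model_info['classifier'],
        'extractor': model_info['feature_ext'],
        'prompt': prompt,
        'algorithm': model_info['algorithm'],
        'score': scores,
    }

    # Score a new essay; results['score'] holds the predicted score.
    results = grade.grade(grader_data, "The sky looks blue due to scattering of sunlight.")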
Arbitrary sets of predictors and text scoring
----------------------------------------------------
This repo can also be used to compute scores for arbitrary sets of numeric and textual predictors. For example, you could predict whether the stock market will rise or fall tomorrow by passing in a set of article headlines, article text, and publication times. Use the functions "create_generic" in create.py and "grade_generic" in grade.py to do this.
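A sketch of the generic workflow, again with invented data:

.. code-block:: python

    import create
    import grade

    # One inner list of numeric predictors and one of textual predictors
    # per observation, plus an integer target per observation (toy data).
    numeric_values = [[9, 30], [12, 45], [8, 15]]
    textual_values = [["Stocks rally on earnings"], ["Markets slide on weak data"], ["Quiet session on Wall Street"]]
    target = [1, 0, 1]

    model_info = create.create_generic(numeric_values, textual_values, target)

    grader_data = {
        'model': model_info['classifier'],
        'extractor': model_info['feature_ext'],
        'algorithm': model_info['algorithm'],
    }

    # Predict the target for a new row of predictors.
    results = grade.grade_generic(grader_data, [10, 20], ["Tech leads modest gains"])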
@ECHO OFF
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set BUILDDIR=_build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
set I18NSPHINXOPTS=%SPHINXOPTS% .
if NOT "%PAPER%" == "" (
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)
if "%1" == "" goto help
if "%1" == "help" (
:help
echo.Please use `make ^<target^>` where ^<target^> is one of
echo. html to make standalone HTML files
echo. dirhtml to make HTML files named index.html in directories
echo. singlehtml to make a single large HTML file
echo. pickle to make pickle files
echo. json to make JSON files
echo. htmlhelp to make HTML files and a HTML help project
echo. qthelp to make HTML files and a qthelp project
echo. devhelp to make HTML files and a Devhelp project
echo. epub to make an epub
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
echo. text to make text files
echo. man to make manual pages
echo. texinfo to make Texinfo files
echo. gettext to make PO message catalogs
echo. changes to make an overview over all changed/added/deprecated items
echo. linkcheck to check all external links for integrity
echo. doctest to run all doctests embedded in the documentation if enabled
goto end
)
if "%1" == "clean" (
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
del /q /s %BUILDDIR%\*
goto end
)
if "%1" == "html" (
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
goto end
)
if "%1" == "dirhtml" (
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
goto end
)
if "%1" == "singlehtml" (
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
goto end
)
if "%1" == "pickle" (
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the pickle files.
goto end
)
if "%1" == "json" (
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the JSON files.
goto end
)
if "%1" == "htmlhelp" (
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
goto end
)
if "%1" == "qthelp" (
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\MLAPI.qhcp
echo.To view the help file:
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\MLAPI.qhc
goto end
)
if "%1" == "devhelp" (
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished.
goto end
)
if "%1" == "epub" (
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The epub file is in %BUILDDIR%/epub.
goto end
)
if "%1" == "latex" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
if errorlevel 1 exit /b 1
echo.
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "text" (
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The text files are in %BUILDDIR%/text.
goto end
)
if "%1" == "man" (
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The manual pages are in %BUILDDIR%/man.
goto end
)
if "%1" == "texinfo" (
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
goto end
)
if "%1" == "gettext" (
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
goto end
)
if "%1" == "changes" (
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
if errorlevel 1 exit /b 1
echo.
echo.The overview file is in %BUILDDIR%/changes.
goto end
)
if "%1" == "linkcheck" (
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
if errorlevel 1 exit /b 1
echo.
echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
goto end
)
if "%1" == "doctest" (
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
if errorlevel 1 exit /b 1
echo.
echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
goto end
)
:end
===============================================
Contributing
===============================================
We welcome contributions! In order to contribute:
1. Find something that would be useful to contribute, or check the github issues tracker to see if there is anything to work on.
2. Fork the repository and add in your new feature.
3. Open a pull request to merge your feature into this repo.
Please feel free to contact vik@edx.org if you have any questions about contributing.
===============================================
Description
===============================================
The ML repo allows anyone to use machine-learning-based automated classification. This automated classification works on both free text (essays, content, etc.) and on numeric values.
Let's say that you have 10000 user reviews for 15 books (i.e. "I loved this!", "I didn't like it.", and so on). What you really want is to use the user reviews to get an aggregate score for each book that indicates how well-received it is. But, in your haste to collect the data, you forgot to ask the users for scores. In this case, the text of each user review is your predictor, and the score that you want to collect from each user for each book is the target variable.
So, how do you turn the text into numbers? One very straightforward way is to label each of the reviews by hand on a scale from 0 (the user didn't like it at all) to 5 (they really loved it). But somewhere around review 200 you are going to get very sick of the whole process. A less labor-intensive way is to use automated classification.
If you choose automated classification for this task, you will score some reasonable subset of the reviews (the more you score, the more accurate the classification, but 200 should be fine as a baseline). Once you have your subset, also called a "training" set, you can "train" a machine learning model that learns how to map your scores to the text of the reviews. It will then be able to automatically score the remaining 9800 reviews. Let's say you also want to take each user's activity level into account in order to weight the score. You can add a numeric predictor alongside your existing text predictor (the review text itself) to predict the target variable (the score).
This repo gives you a nice, clean way to do that via the convenience functions grade, grade_generic, create, and create_generic.
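As a sketch (with invented data; the function names come from create.py and grade.py in this repo), the review-scoring workflow looks like this:

.. code-block:: python

    import create

    # Hand-scored training subset (toy data): each row pairs the user's
    # activity level (numeric predictor) with the review text (textual
    # predictor); the target is the 0-5 score you assigned by hand.
    numeric_values = [[12], [3], [47]]
    textual_values = [["I loved this!"], ["I didn't like it."], ["It was fine."]]
    scores = [5, 1, 3]

    model_info = create.create_generic(numeric_values, textual_values, scores)

    # model_info['classifier'] and model_info['feature_ext'] can then be
    # passed to grade.grade_generic to score the remaining unlabeled reviews.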
===============================================
edX Goals
===============================================
Why is this open source? The algorithms and code in this repository offer a fast and flexible way to predict almost anything that you have data for. The goals in open sourcing this are to encourage widespread adoption and to get useful contributions that improve and expand on this code.
##############
ML
##############
ML Model Creation
-------------------------------------
.. automodule:: create
:members:
ML Grading
--------------------------------------
.. automodule:: grade
:members:
Essay Set
--------------------------------------
.. automodule:: essay_set
:members:
Feature Extractor
--------------------------------------
.. automodule:: feature_extractor
:members:
Predictor Set
--------------------------------------
.. automodule:: predictor_set
:members:
Predictor Extractor
--------------------------------------
.. automodule:: predictor_extractor
:members:
Utility Functions
--------------------------------------
.. automodule:: util_functions
:members:
-r pre-requirements.txt
-r requirements.txt
#!/usr/bin/env bash
# posix compliant sanity check
if [ -z "$BASH" ] || [ "$BASH" = "/bin/sh" ]; then
echo "Please use the bash interpreter to run this script"
exit 1
fi
error() {
printf '\E[31m'; echo "$@"; printf '\E[0m'
}
output() {
printf '\E[36m'; echo "$@"; printf '\E[0m'
}
### START
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
BREW_FILE=$DIR/"brew-formulas.txt"
APT_PKGS_FILE=$DIR/"apt-packages.txt"
case `uname -s` in
[Ll]inux)
command -v lsb_release &>/dev/null || {
error "Please install lsb-release."
exit 1
}
distro=`lsb_release -cs`
case $distro in
maya|lisa|natty|oneiric|precise|quantal)
output "Installing Ubuntu requirements"
# DEBIAN_FRONTEND=noninteractive is required for silent mysql-server installation
export DEBIAN_FRONTEND=noninteractive
# install packages listed in APT_PKGS_FILE
cat $APT_PKGS_FILE | xargs sudo apt-get -y install
;;
*)
error "Unsupported distribution - $distro"
exit 1
;;
esac
;;
Darwin)
if [[ ! -w /usr/local ]]; then
cat<<EO
You need to be able to write to /usr/local for
the installation of brew and brew packages.
Either make sure the group you are in (most likely 'staff')
can write to that directory or simply execute the following
and re-run the script:
$ sudo chown -R $USER /usr/local
EO
exit 1
fi
output "Installing OSX requirements"
if [[ ! -r $BREW_FILE ]]; then
error "$BREW_FILE does not exist, needed to install brew"
exit 1
fi
# brew errors if the package is already installed
for pkg in $(cat $BREW_FILE); do
grep $pkg <(brew list) &>/dev/null || {
output "Installing $pkg"
brew install $pkg
}
done
# paths where brew likes to install python scripts
PATH=/usr/local/share/python:/usr/local/bin:$PATH
command -v pip &>/dev/null || {
output "Installing pip"
easy_install pip
}
if ! grep -Eq ^1.7 <(virtualenv --version 2>/dev/null); then
output "Installing virtualenv >1.7"
pip install 'virtualenv>1.7' virtualenvwrapper
fi
command -v coffee &>/dev/null || {
output "Installing coffee script"
curl --insecure https://npmjs.org/install.sh | sh
npm install -g coffee-script
}
;;
*)
error "Unsupported platform"
exit 1
;;
esac
"""
Functions that create a machine learning model from training data
"""
import os
import sys
import logging
from statsd import statsd
import numpy
#Define base path and add to sys path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
one_up_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..//'))
sys.path.append(one_up_path)
#Import modules that are dependent on the base path
import model_creator
import util_functions
import predictor_set
import predictor_extractor
#Make a log
log = logging.getLogger(__name__)
@statsd.timed('open_ended_assessment.machine_learning.creator.time')
def create(text,score,prompt_string):
"""
    Creates a machine learning model from input text, associated scores, and a prompt
text - A list of strings containing the text of the essays
score - a list of integers containing score values
prompt_string - the common prompt for the set of essays
"""
#Initialize a results dictionary to return
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : util_functions.AlgorithmTypes.classification,
'score' : score, 'text' : text, 'prompt' : prompt_string}
if len(text)!=len(score):
msg = "Target and text lists must be same length."
results['errors'].append(msg)
log.exception(msg)
return results
#Decide what algorithm to use (regression or classification)
try:
#Count the number of unique score points in the score list
if len(util_functions.f7(list(score)))>5:
type = util_functions.AlgorithmTypes.regression
else:
type = util_functions.AlgorithmTypes.classification
except:
type = util_functions.AlgorithmTypes.regression
try:
#Create an essay set object that encapsulates all the essays and alternate representations (tokens, etc)
e_set = model_creator.create_essay_set(text, score, prompt_string)
    except:
        msg = "essay set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
        return results
try:
#Gets features from the essay set and computes error
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model(e_set, type=type)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['algorithm'] = type
results['success']=True
except:
msg = "feature extraction and model creation failed."
results['errors'].append(msg)
log.exception(msg)
#Count number of successful/unsuccessful creations
statsd.increment("open_ended_assessment.machine_learning.creator_count",
tags=["success:{0}".format(results['success'])])
return results
def create_generic(numeric_values, textual_values, target, algorithm = util_functions.AlgorithmTypes.regression):
"""
    Creates a model from generic lists of numeric and textual values
    numeric_values - A list of lists of numeric predictors (one inner list per row)
    textual_values - A list of lists of textual predictors
    (each item in textual_values corresponds to the similarly indexed counterpart in numeric_values)
target - The variable that we are trying to predict. A list of integers.
algorithm - the type of algorithm that will be used
"""
#Initialize a result dictionary to return.
results = {'errors': [],'success' : False, 'cv_kappa' : 0, 'cv_mean_absolute_error': 0,
'feature_ext' : "", 'classifier' : "", 'algorithm' : algorithm}
if len(numeric_values)!=len(textual_values) or len(numeric_values)!=len(target):
msg = "Target, numeric features, and text features must all be the same length."
results['errors'].append(msg)
log.exception(msg)
return results
try:
#Initialize a predictor set object that encapsulates all of the text and numeric predictors
pset = predictor_set.PredictorSet(type="train")
for i in xrange(0, len(numeric_values)):
pset.add_row(numeric_values[i], textual_values[i], target[i])
    except:
        msg = "predictor set creation failed."
        results['errors'].append(msg)
        log.exception(msg)
        return results
try:
#Extract all features and then train a classifier with the features
feature_ext, classifier, cv_error_results = model_creator.extract_features_and_generate_model_predictors(pset, algorithm)
results['cv_kappa']=cv_error_results['kappa']
results['cv_mean_absolute_error']=cv_error_results['mae']
results['feature_ext']=feature_ext
results['classifier']=classifier
results['success']=True
except:
msg = "feature extraction and model creation failed."
results['errors'].append(msg)
log.exception(msg)
#Count number of successful/unsuccessful creations
statsd.increment("open_ended_assessment.machine_learning.creator_count",
tags=["success:{0}".format(results['success'])])
return results
"""
Defines an essay set object, which encapsulates essays from training and test sets.
Performs spell and grammar checking, tokenization, and stemming.
"""
import numpy
import nltk
import sys
import random
import os
import logging
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
base_path=base_path+"/"
log=logging.getLogger(__name__)
MAXIMUM_ESSAY_LENGTH=20000
class EssaySet(object):
def __init__(self, type="train"):
"""
Initialize variables and check essay set type
"""
if(type != "train" and type != "test"):
type = "train"
self._type = type
self._score=[]
self._text=[]
self._id=[]
self._clean_text=[]
self._tokens=[]
self._pos=[]
self._clean_stem_text=[]
self._generated = []
self._prompt = ""
self._spelling_errors=[]
self._markup_text=[]
def add_essay(self, essay_text, essay_score, essay_generated=0):
"""
Add new (essay_text,essay_score) pair to the essay set.
essay_text must be a string.
essay_score must be an int.
essay_generated should not be changed by the user.
Returns a confirmation that essay was added.
"""
# Get maximum current essay id, or set to 0 if this is the first essay added
if(len(self._id) > 0):
max_id = max(self._id)
else:
max_id = 0
# Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
try:
essay_text=essay_text.encode('ascii', 'ignore')
if len(essay_text)<5:
essay_text="Invalid essay."
except:
log.exception("Could not parse essay into ascii.")
try:
#Try conversion of types
essay_score=int(essay_score)
essay_text=str(essay_text)
except:
#Nothing needed here, will return error in any case.
log.exception("Invalid type for essay score : {0} or essay text : {1}".format(type(essay_score),type(essay_text)))
if isinstance(essay_score,int) and isinstance(essay_text, basestring)\
and (essay_generated == 0 or essay_generated == 1):
self._id.append(max_id + 1)
self._score.append(essay_score)
            # Clean text by removing non digit/word/punctuation characters
try:
essay_text=str(essay_text.encode('ascii', 'ignore'))
except:
essay_text = (essay_text.decode('utf-8','replace')).encode('ascii','ignore')
cleaned_essay=util_functions.sub_chars(essay_text).lower()
if(len(cleaned_essay)>MAXIMUM_ESSAY_LENGTH):
cleaned_essay=cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
self._text.append(cleaned_essay)
# Spell correct text using aspell
cleaned_text,spell_errors,markup_text=util_functions.spell_correct(self._text[len(self._text) - 1])
self._clean_text.append(cleaned_text)
self._spelling_errors.append(spell_errors)
self._markup_text.append(markup_text)
# Tokenize text
self._tokens.append(nltk.word_tokenize(self._clean_text[len(self._clean_text) - 1]))
# Part of speech tag text
self._pos.append(nltk.pos_tag(self._clean_text[len(self._clean_text) - 1].split(" ")))
self._generated.append(essay_generated)
# Stem spell corrected text
porter = nltk.PorterStemmer()
por_toks = " ".join([porter.stem(w) for w in self._tokens[len(self._tokens) - 1]])
self._clean_stem_text.append(por_toks)
ret = "text: " + self._text[len(self._text) - 1] + " score: " + str(essay_score)
else:
raise util_functions.InputError(essay_text, "arguments need to be in format "
"(text,score). text needs to be string,"
" score needs to be int.")
def update_prompt(self, prompt_text):
"""
Update the default prompt string, which is "".
prompt_text should be a string.
Returns the prompt as a confirmation.
"""
if(type(prompt_text) == type("text")):
self._prompt = util_functions.sub_chars(prompt_text)
ret = self._prompt
else:
raise util_functions.InputError(prompt_text, "Invalid prompt. Need to enter a string value.")
return ret
def generate_additional_essays(self, e_text, e_score, dict=None, max_syns=3):
"""
Substitute synonyms to generate extra essays from existing ones.
This is done to increase the amount of training data.
Should only be used with lowest scoring essays.
e_text is the text of the original essay.
e_score is the score of the original essay.
dict is a fixed dictionary (list) of words to replace.
max_syns defines the maximum number of additional essays to generate. Do not set too high.
"""
random.seed(1)
e_toks = nltk.word_tokenize(e_text)
all_syns = []
for word in e_toks:
synonyms = util_functions.get_wordnet_syns(word)
if(len(synonyms) > max_syns):
synonyms = random.sample(synonyms, max_syns)
all_syns.append(synonyms)
new_essays = []
for i in range(0, max_syns):
            # Copy the token list so substitutions do not mutate e_toks across iterations
            syn_toks = list(e_toks)
for z in range(0, len(e_toks)):
if len(all_syns[z]) > i and (dict == None or e_toks[z] in dict):
syn_toks[z] = all_syns[z][i]
new_essays.append(" ".join(syn_toks))
for z in xrange(0, len(new_essays)):
self.add_essay(new_essays[z], e_score, 1)
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
The names of its contributors may not be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JEET SUKUMARAN OR MARK T. HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#! /usr/bin/env python
##############################################################################
# Following functions have been taken from the DendroPy library from:
##
## DendroPy Phylogenetic Computing Library.
##
## Copyright 2010 Jeet Sukumaran and Mark T. Holder.
## All rights reserved.
##
## See "LICENSE.txt" for terms and conditions of usage.
##
## If you use this work or any portion thereof in published work,
## please cite it as:
##
## Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
## for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################
import math
## From dendropy.mathlib.probability
def hypergeometric_pmf(x, m, n, k):
"""
Given a population consisting of `m` items of class M and `n` items of class N,
this returns the probability of observing `x` items of class M when sampling
`k` times without replacement from the entire population (i.e., {M,N})
p(x) = (choose(m, x) * choose(n, k-x)) / choose(m+n, k)
"""
# following fails with 'OverflowError: long int too large to convert to
# float' with large numbers
# return float(binomial_coefficient(m, x) * binomial_coefficient(n, k-x))/binomial_coefficient(m+n, k)
a = math.log(binomial_coefficient(m, x))
b = math.log(binomial_coefficient(n, k-x))
c = math.log(binomial_coefficient(m+n, k))
return math.exp(a+b-c)
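#Worked example: hypergeometric_pmf(1, 2, 3, 2) = choose(2,1) * choose(3,1) / choose(5,2) = 6/10 = 0.6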
## From dendropy.mathlib.probability
def binomial_coefficient(population, sample):
"Returns `population` choose `sample`."
s = max(sample, population - sample)
assert s <= population
assert population > -1
if s == population:
return 1
numerator = 1
denominator = 1
for i in xrange(s+1, population + 1):
numerator *= i
denominator *= (i - s)
return numerator/denominator
## From dendropy.mathlib.statistics
class FishersExactTest(object):
"""
Given a 2x2 table:
+---+---+
| a | b |
+---+---+
| c | d |
+---+---+
represented by a list of lists::
[[a,b],[c,d]]
this calculates the sum of the probability of this table and all others
more extreme under the null hypothesis that there is no association between
the categories represented by the vertical and horizontal axes.
"""
def probability_of_table(table):
"""
Given a 2x2 table:
+---+---+
| a | b |
+---+---+
| c | d |
+---+---+
represented by a list of lists::
[[a,b],[c,d]]
this returns the probability of this table under the null hypothesis of
no association between rows and columns, which was shown by Fisher to be
a hypergeometric distribution:
p = ( choose(a+b, a) * choose(c+d, c) ) / choose(a+b+c+d, a+c)
"""
a = table[0][0]
b = table[0][1]
c = table[1][0]
d = table[1][1]
return hypergeometric_pmf(a, a+b, c+d, a+c)
probability_of_table = staticmethod(probability_of_table)
def __init__(self, table):
self.table = table
self.flat_table = [table[0][0], table[0][1], table[1][0], table[1][1]]
self.min_value = min(self.flat_table)
self.max_value = max(self.flat_table)
def _rotate_cw(self, table):
"""
Returns a copy of table such that all the values
are rotated clockwise once.
"""
return [ [ table[1][0], table[0][0] ],
[table[1][1], table[0][1] ] ]
def _min_rotation(self):
"""
Returns copy of self.table such that the smallest value is in the first
(upper left) cell.
"""
table = [list(self.table[0]), list(self.table[1])]
while table[0][0] != self.min_value:
table = self._rotate_cw(table)
return table
def _max_rotation(self):
"""
Returns copy of self.table such that the largest value is in the first
(upper left) cell.
"""
table = [list(self.table[0]), list(self.table[1])]
while table[0][0] != self.max_value:
table = self._rotate_cw(table)
return table
def _sum_left_tail(self):
# left_tail_tables = self._get_left_tail_tables()
# p_vals = [ self.probability_of_table(t) for t in left_tail_tables ]
p_vals = self._get_left_tail_probs()
return sum(p_vals)
def _sum_right_tail(self):
# right_tail_tables = self._get_right_tail_tables()
# p_vals = [ self.probability_of_table(t) for t in right_tail_tables ]
p_vals = self._get_right_tail_probs()
return sum(p_vals)
def _get_left_tail_probs(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
p_vals = []
while True:
table[0][0] -= 1
if table[0][0] < 0:
break
table[0][1] = row_totals[0] - table[0][0]
table[1][0] = col_totals[0] - table[0][0]
table[1][1] = row_totals[1] - table[1][0]
p_vals.append(self.probability_of_table(table))
return p_vals
def _get_right_tail_probs(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
p_vals = []
while True:
table[0][0] += 1
table[0][1] = row_totals[0] - table[0][0]
if table[0][1] < 0:
break
table[1][0] = col_totals[0] - table[0][0]
if table[1][0] < 0:
break
table[1][1] = row_totals[1] - table[1][0]
if table[1][1] < 0:
break
p_vals.append(self.probability_of_table(table))
return p_vals
def _get_left_tail_tables(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
left_tail_tables = []
while True:
table[0][0] -= 1
if table[0][0] < 0:
break
table[0][1] = row_totals[0] - table[0][0]
table[1][0] = col_totals[0] - table[0][0]
table[1][1] = row_totals[1] - table[1][0]
left_tail_tables.append([list(table[0]), list(table[1])])
return left_tail_tables
def _get_right_tail_tables(self):
table = self._min_rotation()
row_totals = [sum(table[0]), sum(table[1])]
col_totals = [table[0][0] + table[1][0], table[0][1] + table[1][1]]
right_tail_tables = []
while True:
table[0][0] += 1
table[0][1] = row_totals[0] - table[0][0]
if table[0][1] < 0:
break
table[1][0] = col_totals[0] - table[0][0]
if table[1][0] < 0:
break
table[1][1] = row_totals[1] - table[1][0]
if table[1][1] < 0:
break
right_tail_tables.append([list(table[0]), list(table[1])])
return right_tail_tables
def left_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
return self.probability_of_table(self.table) + self._sum_left_tail()
def right_tail_p(self):
"""
Returns the sum of probabilities of this table and all others more
extreme.
"""
return self.probability_of_table(self.table) + self._sum_right_tail()
def two_tail_p(self):
"""
        Returns the sum of the probability of this table and all other
        tables whose probability under the null hypothesis is less than
        or equal to it (a two-tailed test).
"""
p0 = self.probability_of_table(self.table)
all_p_vals = self._get_left_tail_probs() + self._get_right_tail_probs()
p_vals = []
for p in all_p_vals:
if p <= p0:
p_vals.append(p)
return sum(p_vals) + p0
def assert_almost_equal(v1, v2, prec=8):
if abs(v1-v2) <= 10**(-prec):
print "OK: {} == {}".format(v1, v2)
else:
print "FAIL: {} != {}".format(v1, v2)
if __name__ == "__main__":
table = [[12, 5], [29, 2]]
ft = FishersExactTest(table)
assert_almost_equal(ft.left_tail_p(), 0.044554737835078267)
assert_almost_equal(ft.right_tail_p(), 0.99452520602190897)
assert_almost_equal(ft.two_tail_p(), 0.08026855207410688)
"""
Functions to score specified data using specified ML models
"""
import sys
import pickle
import os
import numpy
import logging
from statsd import statsd
#Append base path to sys.path so the following modules can be imported
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
#These imports depend on the base path being on sys.path
from essay_set import EssaySet
import predictor_extractor
import predictor_set
import util_functions
#Imports needed to unpickle grader data
import feature_extractor
import sklearn.ensemble
import math
log = logging.getLogger(__name__)
@statsd.timed('open_ended_assessment.machine_learning.grader.time')
def grade(grader_data,submission):
"""
Grades a specified submission using specified models
    grader_data - A dictionary:
    {
        'model' : trained model,
        'extractor' : trained feature extractor,
        'prompt' : prompt for the question,
        'algorithm' : algorithm for the question,
        'score' : list of scores the model was trained on (used for confidence estimation),
    }
submission - The student submission (string)
"""
#Initialize result dictionary
results = {'errors': [],'tests': [],'score': 0, 'feedback' : "", 'success' : False, 'confidence' : 0}
has_error=False
grader_set=EssaySet(type="test")
#This is to preserve legacy functionality
if 'algorithm' not in grader_data:
grader_data['algorithm'] = util_functions.AlgorithmTypes.classification
try:
#Try to add essay to essay set object
grader_set.add_essay(str(submission),0)
grader_set.update_prompt(str(grader_data['prompt']))
except:
results['errors'].append("Essay could not be added to essay set:{0}".format(submission))
has_error=True
#Try to extract features from submission and assign score via the model
try:
grader_feats=grader_data['extractor'].gen_feats(grader_set)
feedback=grader_data['extractor'].gen_feedback(grader_set,grader_feats)[0]
results['score']=int(grader_data['model'].predict(grader_feats)[0])
except :
results['errors'].append("Could not extract features and score essay.")
has_error=True
#Try to determine confidence level
try:
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'], grader_data['score'])
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
if not has_error:
#If the essay is just a copy of the prompt, return a 0 as the score
if(feedback['too_similar_to_prompt']):
results['score']=0
results['correct']=False
results['success']=True
#Generate short form output--number of problem areas identified in feedback
#Add feedback to results if available
results['feedback'] = {}
if 'topicality' in feedback and 'prompt_overlap' in feedback:
results['feedback'].update({
'topicality' : feedback['topicality'],
'prompt-overlap' : feedback['prompt_overlap'],
})
results['feedback'].update(
{
'spelling' : feedback['spelling'],
'grammar' : feedback['grammar'],
'markup-text' : feedback['markup_text'],
}
)
else:
#If error, success is False.
results['success']=False
#Count number of successful/unsuccessful gradings
statsd.increment("open_ended_assessment.machine_learning.grader_count",
tags=["success:{0}".format(results['success'])])
return results
def grade_generic(grader_data, numeric_features, textual_features):
"""
Grades a set of numeric and textual features using a generic model
    grader_data -- dictionary containing:
    {
        'model' : trained model,
        'extractor' : trained feature extractor,
        'algorithm' : type of algorithm to use to score
    }
    numeric_features - list of numeric features to predict on
    textual_features - list of textual features to predict on
"""
results = {'errors': [],'tests': [],'score': 0, 'success' : False, 'confidence' : 0}
has_error=False
#Try to find and load the model file
grader_set=predictor_set.PredictorSet(type="test")
#Try to add essays to essay set object
try:
grader_set.add_row(numeric_features, textual_features,0)
except:
results['errors'].append("Row could not be added to predictor set:{0} {1}".format(numeric_features, textual_features))
has_error=True
#Try to extract features from submission and assign score via the model
try:
grader_feats=grader_data['extractor'].gen_feats(grader_set)
results['score']=grader_data['model'].predict(grader_feats)[0]
except :
results['errors'].append("Could not extract features and score essay.")
has_error=True
#Try to determine confidence level
try:
results['confidence'] = get_confidence_value(grader_data['algorithm'], grader_data['model'], grader_feats, results['score'])
except:
#If there is an error getting confidence, it is not a show-stopper, so just log
log.exception("Problem generating confidence value")
#Count number of successful/unsuccessful gradings
statsd.increment("open_ended_assessment.machine_learning.grader_count",
tags=["success:{0}".format(results['success'])])
if not has_error:
results['success'] = True
return results
def get_confidence_value(algorithm, model, grader_feats, score, scores=None):
    """
    Determines a confidence in a certain score, given proper input parameters
    algorithm - from util_functions.AlgorithmTypes
    model - a trained model
    grader_feats - a row of features used by the model for classification/regression
    score - The score assigned to the submission by a prior model
    scores - the list of scores the model was trained on (only needed for classification)
    """
    if algorithm == util_functions.AlgorithmTypes.classification:
        #If classification, predict with probability, which gives you a matrix of confidences per score point
        min_score = min(numpy.asarray(scores))
        raw_confidence = model.predict_proba(grader_feats)[0, (score - min_score)]
#TODO: Normalize confidence somehow here
confidence=raw_confidence
else:
raw_confidence = model.predict(grader_feats)[0]
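        #Use the distance from the raw prediction to the farther of its two neighboring integers as a rough confidence value (ranges from 0.5 up to 1)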
confidence = max(raw_confidence - math.floor(raw_confidence), math.ceil(raw_confidence) - raw_confidence)
return confidence
#Provides interface functions to create and save models
import numpy
import re
import nltk
import sys
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
import sklearn.ensemble
from itertools import chain
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
import feature_extractor
import logging
import predictor_extractor
log=logging.getLogger()
def read_in_test_data(filename):
"""
Reads in test data file found at filename.
filename must be a tab delimited file with columns id, dummy number column, score, dummy score, text
returns the score and the text
"""
id, e_set, score, score2, text = [], [], [], [], []
combined_raw = open(filename).read()
raw_lines = combined_raw.splitlines()
for row in xrange(1, len(raw_lines)):
id1, set1, score1, score12, text1 = raw_lines[row].strip().split("\t")
id.append(int(id1))
text.append(text1)
e_set.append(int(set1))
score.append(int(score1))
score2.append(int(score12))
return score, text
def read_in_test_prompt(filename):
"""
Reads in the prompt from a text file
Returns string
"""
prompt_string = open(filename).read()
return prompt_string
def read_in_test_data_twocolumn(filename,sep=","):
"""
Reads in a two column version of the test data.
Filename must point to a delimited file.
In filename, the first column should be integer score data.
The second column should be string text data.
Sep specifies the type of separator between fields.
"""
score, text = [], []
combined_raw = open(filename).read()
raw_lines = combined_raw.splitlines()
for row in xrange(1, len(raw_lines)):
        score1, text1 = raw_lines[row].strip().split(sep)
text.append(text1)
score.append(int(score1))
return score, text
def create_essay_set(text, score, prompt_string, generate_additional=True):
"""
Creates an essay set from given data.
Text should be a list of strings corresponding to essay text.
Score should be a list of scores where score[n] corresponds to text[n]
Prompt string is just a string containing the essay prompt.
Generate_additional indicates whether to generate additional essays at the minimum score point or not.
"""
x = EssaySet()
for i in xrange(0, len(text)):
x.add_essay(text[i], score[i])
if score[i] == min(score) and generate_additional == True:
x.generate_additional_essays(x._clean_text[len(x._clean_text) - 1], score[i])
x.update_prompt(prompt_string)
return x
def get_cv_error(clf,feats,scores):
"""
Gets cross validated error for a given classifier, set of features, and scores
clf - classifier
feats - features to feed into the classified and cross validate over
scores - scores associated with the features -- feature row 1 associates with score 1, etc.
"""
results={'success' : False, 'kappa' : 0, 'mae' : 0}
try:
cv_preds=util_functions.gen_cv_preds(clf,feats,scores)
err=numpy.mean(numpy.abs(numpy.array(cv_preds)-scores))
kappa=util_functions.quadratic_weighted_kappa(list(cv_preds),scores)
results['mae']=err
results['kappa']=kappa
results['success']=True
except ValueError:
#If this is hit, everything is fine. It is hard to explain why the error occurs, but it isn't a big deal.
log.exception("Not enough classes (0,1,etc) in each cross validation fold.")
except:
log.exception("Error getting cv error estimates.")
return results
def get_algorithms(type):
"""
    Gets two classifiers of the given algorithm type and returns them: the first for predicting, the second for estimating cv error.
type - one of util_functions.AlgorithmTypes
"""
if type == util_functions.AlgorithmTypes.classification:
clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
else:
clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
clf2=sklearn.ensemble.GradientBoostingRegressor(n_estimators=100, learn_rate=.05,
max_depth=4, random_state=1,min_samples_leaf=3)
return clf, clf2
def extract_features_and_generate_model_predictors(predictor_set, type=util_functions.AlgorithmTypes.regression):
"""
Extracts features and generates predictors based on a given predictor set
predictor_set - a PredictorSet object that has been initialized with data
type - one of util_functions.AlgorithmType
"""
    if(type not in [util_functions.AlgorithmTypes.regression, util_functions.AlgorithmTypes.classification]):
        type = util_functions.AlgorithmTypes.regression
f = predictor_extractor.PredictorExtractor()
f.initialize_dictionaries(predictor_set)
train_feats = f.gen_feats(predictor_set)
clf,clf2 = get_algorithms(type)
cv_error_results=get_cv_error(clf2,train_feats,predictor_set._target)
try:
set_score = numpy.asarray(predictor_set._target, dtype=numpy.int)
clf.fit(train_feats, set_score)
except ValueError:
log.exception("Not enough classes (0,1,etc) in sample.")
set_score[0]=1
set_score[1]=0
clf.fit(train_feats, set_score)
return f, clf, cv_error_results
def extract_features_and_generate_model(essays, type=util_functions.AlgorithmTypes.regression):
    """
    Feed in an essay set to get feature vector and classifier
    essays must be an essay set object
    type - one of util_functions.AlgorithmTypes (overridden below based on how
    many distinct score points exist in the data)
    returns a trained FeatureExtractor object, a trained classifier, and cv error results
    """
    f = feature_extractor.FeatureExtractor()
    f.initialize_dictionaries(essays)
    train_feats = f.gen_feats(essays)
    set_score = numpy.asarray(essays._score, dtype=numpy.int)
    # Use regression when there are many distinct score points, classification otherwise
    if len(util_functions.f7(list(set_score))) > 5:
        type = util_functions.AlgorithmTypes.regression
    else:
        type = util_functions.AlgorithmTypes.classification
    clf, clf2 = get_algorithms(type)
    cv_error_results = get_cv_error(clf2, train_feats, essays._score)
    try:
        clf.fit(train_feats, set_score)
    except ValueError:
        log.exception("Not enough classes (0,1,etc) in sample.")
        # Perturb the first two targets so fitting can proceed with at least two classes
        set_score[0] = 1
        set_score[1] = 0
        clf.fit(train_feats, set_score)
    return f, clf, cv_error_results
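
# Illustrative sketch (not part of the library): training a model from an
# EssaySet built with create_essay_set. The essays and scores are hypothetical.
def _example_train_model():
    essays = create_essay_set(
        ["First essay.", "Second essay.", "Third essay."],
        [0, 1, 2],
        "A hypothetical prompt.")
    feature_ext, clf, cv_error_results = extract_features_and_generate_model(essays)
    return feature_ext, clf, cv_error_results
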
def dump_model_to_file(prompt_string, feature_ext, classifier, text, score, model_path):
    """
    Writes out a model to a file.
    prompt_string is a string containing the prompt
    feature_ext is a trained FeatureExtractor object
    classifier is a trained classifier
    model_path is the path to write the model file out to
    """
    model_file = {'prompt': prompt_string, 'extractor': feature_ext, 'model': classifier, 'text': text, 'score': score}
    # Pickle files should be opened in binary mode
    pickle.dump(model_file, open(model_path, "wb"))

def create_essay_set_and_dump_model(text, score, prompt, model_path, additional_array=None):
    """
    Function that creates essay set, extracts features, and writes out model
    See above functions for argument descriptions
    additional_array is currently unused
    """
    essay_set = create_essay_set(text, score, prompt)
    feature_ext, clf, cv_error_results = extract_features_and_generate_model(essay_set)
    dump_model_to_file(prompt, feature_ext, clf, text, score, model_path)
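
# Illustrative sketch (not part of the library): reading back a model written
# by dump_model_to_file. The path is hypothetical; the file is a binary pickle.
def _example_load_model(model_path="models/essay_model.p"):
    model_file = pickle.load(open(model_path, "rb"))
    return model_file['extractor'], model_file['model']
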
"""
Extracts features for an arbitrary set of textual and numeric inputs
"""
import numpy
import re
import nltk
import sys
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os
from itertools import chain
import copy
import operator
import logging
import math
from feature_extractor import FeatureExtractor
#Append to path and then import things that depend on path
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
from essay_set import EssaySet
import util_functions
if not base_path.endswith("/"):
    base_path = base_path + "/"

log = logging.getLogger(__name__)

class PredictorExtractor(object):
    def __init__(self):
        self._extractors = []
        self._initialized = False

    def initialize_dictionaries(self, p_set):
        """
        Initialize dictionaries with the textual inputs in the PredictorSet object
        p_set - PredictorSet object that has had data fed in
        """
        if not hasattr(p_set, '_type') or p_set._type != "train":
            error_message = "needs to be a predictor set of the train type."
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)
        div_length = len(p_set._essay_sets)
        if div_length == 0:
            div_length = 1
        # Ensures that even with a large amount of input textual features, training time stays reasonable
        max_feats2 = int(math.floor(200 / div_length))
        for i in xrange(0, len(p_set._essay_sets)):
            self._extractors.append(FeatureExtractor())
            self._extractors[i].initialize_dictionaries(p_set._essay_sets[i], max_feats2=max_feats2)
        self._initialized = True
        return True

    def gen_feats(self, p_set):
        """
        Generates features based on an input p_set
        p_set - PredictorSet
        """
        if not self._initialized:
            error_message = "Dictionaries have not been initialized."
            log.exception(error_message)
            raise util_functions.InputError(p_set, error_message)
        textual_features = []
        for i in xrange(0, len(p_set._essay_sets)):
            textual_features.append(self._extractors[i].gen_feats(p_set._essay_sets[i]))
        # Concatenate textual feature columns, then append the numeric columns
        textual_matrix = numpy.concatenate(textual_features, axis=1)
        predictor_matrix = numpy.array(p_set._numeric_features)
        log.debug(textual_matrix.shape)
        log.debug(predictor_matrix.shape)
        overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
        return overall_matrix.copy()
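
# Illustrative sketch (not part of the library): extracting a combined feature
# matrix from an already-populated, train-type predictor set (p_set is assumed
# to be a predictor_set.PredictorSet with rows added).
def _example_predictor_features(p_set):
    extractor = PredictorExtractor()
    extractor.initialize_dictionaries(p_set)
    # Rows align with p_set._target; textual feature columns come before numeric ones
    return extractor.gen_feats(p_set)
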
import numpy
import nltk
import sys
import random
import os
import logging
import essay_set
base_path = os.path.dirname(__file__)
sys.path.append(base_path)
import util_functions
if not base_path.endswith("/"):
    base_path = base_path + "/"

log = logging.getLogger(__name__)

class PredictorSet(object):
    def __init__(self, type="train"):
        """
        Initialize variables and check predictor set type
        """
        if type != "train" and type != "test":
            type = "train"
        self._type = type
        self._target = []
        self._textual_features = []
        self._numeric_features = []
        self._essay_sets = []

    def add_row(self, numeric_features, textual_features, target):
        # Basic input checking
        if not isinstance(target, (int, long, float)):
            error_message = "Target is not a numeric value."
            log.exception(error_message)
            raise util_functions.InputError(target, error_message)
        if not isinstance(numeric_features, list):
            error_message = "Numeric features are not a list."
            log.exception(error_message)
            raise util_functions.InputError(numeric_features, error_message)
        if not isinstance(textual_features, list):
            error_message = "Textual features are not a list."
            log.exception(error_message)
            raise util_functions.InputError(textual_features, error_message)
        # Do some length checking for parameters
        if len(self._numeric_features) > 0:
            numeric_length = len(self._numeric_features[-1])
            current_numeric_length = len(numeric_features)
            if numeric_length != current_numeric_length:
                error_message = "Numeric features are an improper length."
                log.exception(error_message)
                raise util_functions.InputError(numeric_features, error_message)
        if len(self._textual_features) > 0:
            textual_length = len(self._textual_features[-1])
            current_textual_length = len(textual_features)
            if textual_length != current_textual_length:
                error_message = "Textual features are an improper length."
                log.exception(error_message)
                raise util_functions.InputError(textual_features, error_message)
        # Now check to see if text features and numeric features are individually correct
        for i in xrange(0, len(numeric_features)):
            try:
                numeric_features[i] = float(numeric_features[i])
            except (ValueError, TypeError):
                error_message = "Numeric feature {0} not numeric.".format(numeric_features[i])
                log.exception(error_message)
                raise util_functions.InputError(numeric_features, error_message)
        for i in xrange(0, len(textual_features)):
            try:
                textual_features[i] = str(textual_features[i].encode('ascii', 'ignore'))
            except Exception:
                error_message = "Textual feature {0} not string.".format(textual_features[i])
                log.exception(error_message)
                raise util_functions.InputError(textual_features, error_message)
        # Create essay sets for textual features if needed
        if len(self._textual_features) == 0:
            for i in xrange(0, len(textual_features)):
                self._essay_sets.append(essay_set.EssaySet(type=self._type))
        # Add numeric and textual features
        self._numeric_features.append(numeric_features)
        self._textual_features.append(textual_features)
        # Add targets
        self._target.append(target)
        # Add textual features to essay sets
        for i in xrange(0, len(textual_features)):
            self._essay_sets[i].add_essay(textual_features[i], target)
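
# Illustrative sketch (not part of the library): populating a PredictorSet with
# hypothetical rows. Each row pairs a list of numeric features, a parallel list
# of textual features, and a numeric target.
def _example_build_predictor_set():
    p_set = PredictorSet(type="train")
    p_set.add_row([1.0, 42.0], ["A short text answer.", "Another text field."], 1)
    p_set.add_row([0.0, 17.0], ["A second response.", "More text."], 0)
    return p_set
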
boto==2.6.0
coverage==3.5.3
dogstatsd-python==0.2
lxml==3.0.1
mock==0.8.0
nltk==2.0.3
nose==1.2.1
scipy==0.11.0
path.py==3.0
pip
pylint==0.26.0
pytz==2012h
scikit-learn==0.12.1
sphinx
django-sphinx-autodoc
from setuptools import setup, find_packages

with open('requirements.txt') as f:
    required = f.read().splitlines()

setup(
    name="machine-learning",
    version="0.1",
    packages=['machine_learning', 'machine_learning.external_code', 'machine_learning.data', 'machine_learning.external_code.fisher'],
    package_data={
        '': ['*.txt', '*.rst', '*.p'],
    },
    author="Vik Paruchuri",
    author_email="vik@edx.org",
    description="Machine learning based automated text classification for essay scoring.",
    license="AGPL",
    keywords="ml machine learning nlp essay education",
    url="https://github.com/edx/machine-learning",
    include_package_data=True,
    # Install the pinned dependencies read from requirements.txt above
    install_requires=required,
)
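
# Typical usage of this setup script (assuming a checkout of the repo):
#   pip install -e .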