-
Y. Chali, "Topic detection of unrestricted texts : approaches and evaluations," Applied Artificial Intelligence, vol. 19, iss. 2, pp. 119-136, 2005.
@article{chali_topic_2005,
  author   = {Chali, Yllias},
  title    = {Topic detection of unrestricted texts : approaches and evaluations},
  journal  = {Applied Artificial Intelligence},
  volume   = {19},
  number   = {2},
  pages    = {119--136},
  year     = {2005},
  doi      = {10.1080/08839510590887441},
  keywords = {Analyse de contenu},
  abstract = {Topic detection and tracking refers to automatic techniques for locating topically related cohesive paragraphs in a stream of text. Most documents are about more than one subject, but many Natural Language Processing {(NLP)} and Information Retrieval {(IR)} techniques implicitly assume documents have just one topic. Even in the presence of a single topic within a document, the document may address multiple subtopics and various aspects of the primary topic. Hence, dividing documents into topically coherent units and discovering their topic might have many uses. We describe new clues that account for the topic of grouping of contiguous portions of the text. Those clues are based on general lexical resources, which make them applicable to unrestricted texts, and can have many uses such as helping users find answers to general questions in an information search task, or in question/answering systems, or in text summarization. We devise an algorithm for identifying these clues, and we report on the performance of these clues, as well as the improvements suggested by our experiments.},
  annote   = {{{\textless}p{\textgreater}chaliYllias2005.pdf{\textless}/p{\textgreater}}}
}
-
B. Stein and S. M. zu Eissen, "Topic Identification : framework and application," in Proceedings of the 4th international conference on knowledge management (I-KNOW 04), Graz, Austria, 2004, pp. 353-360.
@inproceedings{stein_topic_2004,
  author    = {Stein, Benno and zu Eissen, Sven Meyer},
  title     = {Topic Identification : framework and application},
  booktitle = {Proceedings of the 4th international conference on knowledge management ({I-KNOW} 04)},
  series    = {Journal of universal computer science},
  pages     = {353--360},
  address   = {Graz, Austria},
  month     = jul,
  year      = {2004},
  url       = {http://www.uni-weimar.de/medien/webis/publications/downloads/papers/stein_2004b.pdf},
  keywords  = {Analyse de contenu},
  abstract  = {This paper is on topic identification, i. e., the construction of useful labels for sets of documents. Topic identification is essential in connection within categorizing search applications, where several sets of documents are delivered and an expressive description for each category must be constructed on the fly. The contributions of this paper are threefold. (1) It presents a framework to formally specify the topic identification problem along with its desired properties, (2) it introduces a classification scheme for topic identification algorithms and outlines the respective algorithm of the {AIsearch} meta search engine, (3) it proposes a hybrid approach to topic identification, which relies on classification knowledge of existing ontologies.},
  annote    = {{{\textless}p{\textgreater}steinBenno2004.pdf{\textless}/p{\textgreater}}}
}
-
C. Desmarais and J. Moscarola, "Analyse de contenu et analyse lexicale, le cas d’une étude en management public," in Lexicometrica, 2004.
@inproceedings{desmarais_analyse_2004,
  author    = {Desmarais, Céline and Moscarola, Jean},
  title     = {Analyse de contenu et analyse lexicale, le cas d'une étude en management public},
  booktitle = {Lexicometrica},
  series    = {Question de méthode},
  volume    = {numéro spécial},
  year      = {2004},
  url       = {http://www.cavi.univ-paris3.fr/lexicometrica/thema/thema7/Texte-Moscarola.pdf},
  keywords  = {Analyse de contenu},
  abstract  = {Les outils de traitement informatisés des données textuelles ouvrent de nouvelles perspectives pour les méthodes d'analyse des données qualitatives. Cependant l'utilisation de ces outils n'est guère stabilisée et leur validité pose question. Comment s'y prendre pour en tirer une connaissance, et comment garantir à celle-ci une légitimité scientifique ? Quelles méthodes privilégier parmi l'arsenal dont le chercheur dispose désormais ? Pour apporter des réponses à cet ensemble de questions, cet article analyse un processus de recherche ayant eu pour objectif d'appréhender l'évolution des rôles de l'encadrement des villes, en contexte de modernisation. Il expose plus particulièrement deux méthodes : l'analyse de contenu et l'analyse lexicale, dont les potentialités et la complémentarité sont développées},
  annote    = {{{\textless}p{\textgreater}desmaraisCeline2004.pdf{\textless}/p{\textgreater}}}
}
-
A. Moschitti and R. Basili, "Complex linguistic features for text classification : a comprehensive study," in Advances in information retrieval : 26th European Conference on IR Research, ECIR 2004, Sunderland, UK, April 5-7, 2004 : proceedings, Berlin ; New York, NY, 2004, pp. 181-196.
@inproceedings{moschitti_complex_2004,
  author    = {Moschitti, Alessandro and Basili, Roberto},
  title     = {Complex linguistic features for text classification : a comprehensive study},
  booktitle = {Advances in information retrieval : 26th European Conference on {IR} Research, {ECIR} 2004, Sunderland, {UK}, April 5-7, 2004 : proceedings},
  series    = {Lecture notes in computer science},
  volume    = {2997},
  pages     = {181--196},
  publisher = {Springer},
  address   = {Berlin ; New York, {NY}},
  year      = {2004},
  isbn      = {1558602070},
  url       = {http://dit.unitn.it/~moschitt/articles/ECIR2004.pdf},
  keywords  = {Analyse de contenu, Approche probabiliste, Catégorisation, Classification, Indexation, Langage naturel},
  abstract  = {Previous researches on advanced representations for document retrieval have shown that statistical state-of-the-art models are not improved by a variety of different linguistic representations. Phrases, word senses and syntactic relations derived by Natural Language Processing {(NLP)} techniques were observed ineffective to increase retrieval accuracy. For Text Categorization {(TC)} are available fewer and less definitive studies on the use of advanced document representations as it is a relatively new research area (compared to document retrieval). In this paper, advanced document representations have been investigated. Extensive experimentation on representative classifiers, Rocchio and {SVM,} as well as a careful analysis of the literature have been carried out to study how some {NLP} techniques used for indexing impact {TC.} Cross validation over 4 different corpora in two languages allowed us to gather an overwhelming evidence that complex nominals, proper nouns and word senses are not adequate to improve {TC} accuracy.},
  annote    = {{{\textless}p{\textgreater}moschittiAlessandro2004.pdf{\textless}/p{\textgreater}}}
}
-
R. Adaikkalavan, L. Elkhalifa, and A. Y. Aslandogan, "Topic identification through ontology-based concept generalization," University of Texas in Arlington, Texas, États-Unis, Project report CSE-2003-26, 2003.
@techreport{adaikkalavan_topic_2003,
  author      = {Adaikkalavan, Raman and Elkhalifa, Laali and Aslandogan, Y. Alp},
  title       = {Topic identification through ontology-based concept generalization},
  type        = {Project report},
  number      = {{CSE-2003-26}},
  institution = {University of Texas at Arlington},
  address     = {Arlington, Texas, {États-Unis}},
  year        = {2003},
  url         = {http://www.cse.uta.edu/Research/Publications/Downloads/CSE-2003-26.pdf},
  keywords    = {Analyse de contenu, Ontologie},
  abstract    = {We present a method for topic identification of web pages based on contextual support and structural term weighting. For topic selection, concept expansion is performed through an ontology such as the {WordNet.} The experimental evaluation suggests that the approach is promising and can be adapted to many categorization tasks.},
  annote      = {{{\textless}p{\textgreater}adaikkalavanRaman2003.pdf{\textless}/p{\textgreater}}}
}
-
L. Fontaine and Y. Kodratoff, The role of thematic and concept texture in scientific text: comparing native and non-native writers of english, 2003.
@misc{fontaine_role_2003,
  author   = {Fontaine, Lise and Kodratoff, Yves},
  title    = {The role of thematic and concept texture in scientific text: comparing native and non-native writers of english},
  year     = {2003},
  url      = {http://www.lri.fr/~yk/fon-kod-eng.pdf},
  keywords = {Analyse de contenu, Fouille de texte},
  abstract = {This paper explores two types of text structure in scientific research articles written by native and non-native writers of English. We use a textlinguistic analysis to study thematic progression in these texts. We compare these results to the study of a new type of texture found in texts: that is, the texture formed by concept identification using scientific methods from Text Mining approaches. Our conclusions point out some of the specific difficulties that non-native writers face in managing the structure of their texts. We also look toward developing an automatic aide for non-native writers.},
  annote   = {{{\textless}p{\textgreater}fontaineLise2003.pdf{\textless}/p{\textgreater}}}
}
-
M. Aery, N. Remamurthy, and A. Y. Aslandogan, "Topic identification of textual data," University of Texas, Arlington, Texas, Project report CSE-2003-25, 2003.
@techreport{aery_topic_2003,
  author      = {Aery, Manu and Remamurthy, Naveen and Aslandogan, Y. Alp},
  title       = {Topic identification of textual data},
  type        = {Project report},
  number      = {{CSE-2003-25}},
  institution = {University of Texas at Arlington},
  address     = {Arlington, Texas},
  year        = {2003},
  url         = {http://www.cse.uta.edu/Research/Publications/Downloads/CSE-2003-25.pdf},
  keywords    = {Analyse de contenu},
  abstract    = {In this work we study Unigram and {TF*IDF-based} methods for classifying web pages into categories drawn from popular web directories such as Google and Yahoo. Experimental evaluation reveals that the Unigram model out-performs the {TFIDF} Classifier on most counts, reasonably as it takes the probability measure to determine the topic categories against the word and inverse document frequency metric of the {TFIDF} classifier. We also observe the impact of the size of the training corpus on the categorization performance.},
  annote      = {{{\textless}p{\textgreater}aeryManu2003.pdf{\textless}/p{\textgreater}}}
}
-
R. Vinot, N. Grabar, and M. Valette, "Application d’algorithmes de classification automatique pour la détection des contenus racistes," in TALN, Batz-sur-Mer, France, 2003.
@inproceedings{vinot_application_2003,
  author    = {Vinot, Romain and Grabar, Natalia and Valette, Mathieu},
  title     = {Application d’algorithmes de classification automatique pour la détection des contenus racistes},
  booktitle = {{TALN}},
  address   = {{Batz-sur-Mer}, France},
  month     = jun,
  year      = {2003},
  url       = {http://www.atala.org/doc/actes_taln/AC_0101.pdf},
  keywords  = {Analyse de contenu, Classification},
  abstract  = {Le filtrage de contenus illicites sur Internet est une problématique difficile qui est actuellement résolue par des approches à base de listes noires et de mots-clés. Les systèmes de classification textuelle par apprentissage automatique nécessitant peu d’interventions humaines, elles peuvent avantageusement remplacer ou compléter les méthodes précédentes pour faciliter les mises à jour. Ces techniques, traditionnellement utilisées avec des catégories définies par leur sujet (économie ou sport par exemple), sont fondées sur la présence ou l’absence de mots. Nous présentons une évaluation de ces techniques pour le filtrage de contenus racistes. Contrairement aux cas traditionnels, les documents ne doivent pas être catégorisés suivant leur sujet mais suivant le point de vue énoncé (raciste ou antiraciste). Nos résultats montrent que les classifieurs, essentiellement lexicaux, sont néanmoins bien adaptées : plus de 90\% des documents sont correctement classés, voir même 99\% si l’on accepte une classe de rejet (avec 20\% d’exemples non classés). Filtering of illicit contents on the Internet is a difficult issue which is currently solved with black lists and keywords. Machine-learning text categorization techniques needing little human intervention can replace or complete the previous methods to keep the filtering up-to-date easily. These techniques, usually used with topic classes (economy or sport for instance), are based on the presence or absence of words. We present an evaluation of these techniques for racism filtering. Unlike the traditional systems, documents are not categorized according to their main topic but according to the expressed point of view (racist or anti-racist). Our results show that these lexical techniques are well adapted : more than 90\% of the documents are correctly classified, or even 99\% if a rejection class is accepted (20\% of the examples are not classified).},
  annote    = {{{\textless}p{\textgreater}vinotRomain2003.pdf{\textless}/p{\textgreater}}}
}
-
J. Burrows, "’Delta’ : a measure of stylistic difference and a guide to likely authorship," Literary and Linguistic Computing, vol. 17, iss. 3, pp. 267-287, 2002.
@article{burrows_delta_2002,
  author   = {Burrows, John},
  title    = {{'Delta'} : a measure of stylistic difference and a guide to likely authorship},
  journal  = {Literary and Linguistic Computing},
  volume   = {17},
  number   = {3},
  pages    = {267--287},
  month    = sep,
  year     = {2002},
  doi      = {10.1093/llc/17.3.267},
  url      = {http://llc.oxfordjournals.org/cgi/reprint/17/3/267.pdf},
  keywords = {Analyse de contenu},
  abstract = {This paper is a companion to my `Questions of authorship: attribution and beyond', in which I sketched a new way of using the relative frequencies of the very common words for comparing written texts and testing their likely authorship. The main emphasis of that paper was not on the new procedure but on the broader consequences of our increasing sophistication in making such comparisons and the increasing (although never absolute) reliability of our inferences about authorship. My present objects, accordingly, are to give a more complete account of the procedure itself; to report the outcome of an extensive set of trials; and to consider the strengths and limitations of the new procedure. The procedure offers a simple but comparatively accurate addition to our current methods of distinguishing the most likely author of texts exceeding about 1,500 words in length. It is of even greater value as a method of reducing the field of likely candidates for texts of as little as 100 words in length. Not unexpectedly, it works least well with texts of a genre uncharacteristic of their author and, in one case, with texts far separated in time across a long literary career. Its possible use for other classificatory tasks has not yet been investigated.},
  annote   = {{{\textless}p{\textgreater}burrowsJohn2002.pdf{\textless}/p{\textgreater}}}
}
-
P. A. Fortier, M. Louwerse, and W. van Peer, "Prototype effect vs rarity effect in literary style." Amsterdam: John Benjamins, 2002, pp. 397-405.
@incollection{fortier_prototype_2002,
  author    = {Fortier, P. A.},
  title     = {Prototype effect vs rarity effect in literary style},
  editor    = {Louwerse, Max and van Peer, Willie},
  booktitle = {Thematics: Interdisciplinary Studies},
  pages     = {397--405},
  publisher = {John Benjamins},
  address   = {Amsterdam},
  year      = {2002},
  keywords  = {Analyse de contenu, Linguistique}
}
-
R. Hogenraad, M. Louwerse, and W. van Peer, "Moving targets : the making and molding of a theme." Amsterdam: John Benjamins, 2002, pp. 353-376.
@incollection{hogenraad_moving_2002,
  author    = {Hogenraad, Robert},
  title     = {Moving targets : the making and molding of a theme},
  editor    = {Louwerse, Max and van Peer, Willie},
  booktitle = {Thematics: Interdisciplinary Studies},
  pages     = {353--376},
  publisher = {John Benjamins},
  address   = {Amsterdam},
  year      = {2002},
  keywords  = {Analyse de contenu}
}
-
H. Larochelle, "Étude de la pertinence de métriques statistiques pour la détection de termes dans un document," Mémoire de stage du CRSNG, Université de Montréal, 2002.
@phdthesis{larochelle_tude_2002,
  author   = {Larochelle, Hugo},
  title    = {Étude de la pertinence de métriques statistiques pour la détection de termes dans un document},
  type     = {Mémoire de stage du {CRSNG}},
  school   = {Université de Montréal},
  year     = {2002},
  pages    = {47 p.},
  url      = {http://www.iro.umontreal.ca/~felipe/Memoires/hugo.pdf},
  keywords = {Analyse de contenu, Approche statistique, Fouille de texte},
  abstract = {L'extraction terminologique est une activité de spécialistes (ces spécialistes sont appelés des terminologues) et leur expertise est nécessaire dans de nombreux domaines, notamment la traduction. De nombreux outils d'aide à l'extraction terminologique ont été proposés ou sont disponibles sur le marché. La performance de ces logiciels n'est pas toujours faciles à estimer. Hugo a travaillé sur l'implantation et la comparaison des performances d'une large gamme de mesures statistiques proposées dans la littérature.},
  annote   = {{{\textless}p{\textgreater}larochelleHugo2002.pdf{\textless}/p{\textgreater}}}
}
-
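T. S. Sae and K. T. Enya, "Annotating topic hierarchy based on the feature selection technique," in Proceedings of the IASTED International Conference Applied Informatics. International Symposium on Artificial Intelligence and Applications, Innsbruck, Austria, 2001, pp. 265-269.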
@comment{NOTE(review): the author name parts look reordered by the import (compare the citation key) - verify against the published paper before changing them.}
@inproceedings{saravadee_sae_annotating_2001,
  author    = {Tan Saravadee Sae and Kong Tang Enya},
  title     = {Annotating topic hierarchy based on the feature selection technique},
  booktitle = {Proceedings of the {IASTED} International Conference Applied Informatics. International Symposium on Artificial Intelligence and Applications},
  pages     = {265--269},
  publisher = {{ACTA} Press},
  address   = {Innsbruck, Austria},
  year      = {2001},
  note      = {Copyright 2002, {IEE}},
  keywords  = {Analyse de contenu},
  abstract  = {This paper proposes an approach of using feature selection technique to annotate a topic hierarchy. The fundamental idea behind this work is to select a set of keywords from the documents in a topic hierarchy to enrich the hierarchy's concept. This approach is developed and tested on one of the existing web hierarchy, Yahoo! Shopping hierarchy. Our experiments are performed using a feature selection algorithm that combines both methods for feature selection in machine learning and text learning, to select a set of keywords for each node in the topic hierarchy. We believe that this set of keywords can provide us with more information about the node's concept. An experimental evaluation on the real world data collected from the web show that our approach gives promising results and can potentially be used to annotate a web hierarchy},
  annote    = {{\textless}p{\textgreater}7314703 natural language processing feature selection topic hierarchy information retrieval Internet search process hierarchical classification machine learning text data{\textless}/p{\textgreater}}
}
-
M. K. Sjöblom and É. Brunet, "La thématique. Essai de repérage automatique dans l’oeuvre d’un écrivain," in 5es journées internationales d’analyse statistique des données textuelles, 2000.
@inproceedings{kastberg_sjblom_la_2000,
  author    = {Kastberg Sjöblom, Margareta and Brunet, Étienne},
  title     = {La thématique. Essai de repérage automatique dans l'oeuvre d'un écrivain},
  booktitle = {5es journées internationales d'analyse statistique des données textuelles},
  month     = mar,
  year      = {2000},
  keywords  = {Analyse de contenu, Fouille de texte}
}
-
M. L. Ryan, Cyberspace textuality: computer technology and literary theory, Bloomington, In.: Indiana University Press, 1999.
@book{ryan_cyberespace_1999,
  author    = {Ryan, M. L.},
  title     = {Cyberspace textuality: computer technology and literary theory},
  publisher = {Indiana University Press},
  address   = {Bloomington, In.},
  year      = {1999},
  keywords  = {Analyse de contenu, Informatique, Web}
}
-
D. Merkl and A. Rauber, "Uncovering associations between documents," in Proceedings of the international joint conference on artificial intelligence (IJCAI99), 1999.
@comment{The CiteSeerX record id lives in the ignored `citeseerx` field: it is not a registered DOI, and the url already points at the CiteSeerX copy.}
@inproceedings{merkl_uncovering_1999,
  author    = {Merkl, Dieter and Rauber, Andreas},
  title     = {Uncovering associations between documents},
  booktitle = {Proceedings of the international joint conference on artificial intelligence ({IJCAI99})},
  year      = {1999},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.36.3567&rep=rep1&type=pdf},
  citeseerx = {10.1.1.36.3567},
  keywords  = {Analyse de contenu},
  abstract  = {The self-organizing map is a very popular unsupervised neural network model for the analysis of high-dimensional input data as it is typically found in information retrieval applications. However, the interpretation of the map requires much manual effort, especially as far as the analysis of the learned features and the characteristics of identified clusters is concerned. In this paper we present our novel {LabelSOM} method which, based on the features learned by the map, automatically selects the most descriptive features of the input patterns mapped onto a particular unit of the map, thus making the associations between the various clusters within the map explicit. We demonstrate the benefits of this approach with examples from text classification using two different real-world document archives. In this particular},
  annote    = {{{\textless}p{\textgreater}merkiDieter1999.pdf{\textless}/p{\textgreater}}}
}
-
T. K. Landauer, P. W. Foltz, and D. Laham, "Introduction to latent semantic analysis," Discourse Processes, vol. 25, pp. 259-284, 1998.
@article{landauer_introduction_1998,
  author   = {Landauer, Thomas K. and Foltz, Peter W. and Laham, Darrell},
  title    = {Introduction to latent semantic analysis},
  journal  = {Discourse Processes},
  volume   = {25},
  pages    = {259--284},
  year     = {1998},
  url      = {http://lsa.colorado.edu/papers/dp1.LSAintro.pdf},
  keywords = {Analyse de contenu},
  annote   = {{{\textless}p{\textgreater}landauerThomas1998.pdf{\textless}/p{\textgreater}}}
}
-
E. Hovy and D. Radev, Intelligent text summarization. Papers from the 1998 AAAI Spring Symposium, Menlo Park, Calif.: AAAI Press, 1998.
@book{hovy_intelligent_1998,
  editor    = {Hovy, Eduard and Radev, Dragomir},
  title     = {Intelligent text summarization. Papers from the 1998 {AAAI} Spring Symposium},
  publisher = {{AAAI} Press},
  address   = {Menlo Park, Calif.},
  year      = {1998},
  keywords  = {Analyse de contenu, Fouille de texte}
}
-
A. D. Robert and A. Bouillaguet, L’analyse du contenu, Paris: Presses universitaires de France, 1997.
@book{robert_lanalyse_1997,
  author    = {Robert, André D. and Bouillaguet, Annick},
  title     = {L'analyse du contenu},
  series    = {Que sais-je ?},
  publisher = {Presses universitaires de France},
  address   = {Paris},
  year      = {1997},
  keywords  = {Analyse de contenu}
}
-
A. Auger, Repérage des énoncés d’intérêt définitoire dans les bases de données textuelles, 1997.
@misc{auger_reprage_1997,
  author   = {Auger, Alain},
  title    = {Repérage des énoncés d'intérêt définitoire dans les bases de données textuelles},
  month    = may,
  year     = {1997},
  url      = {http://doc.rero.ch/getfile.py?docid=2&name=these_AugerA&format=pdf&version=1},
  keywords = {Analyse de contenu, Fouille de texte},
  abstract = {Notre époque est caractérisée par une inflation exponentielle de la documentation informatisée. Pour la lexicographie et la terminologie modernes, la multiplication des bases de données textuelles pose de nombreux défis. Comment, en effet, rédiger un article de dictionnaire qui puisse rendre compte de l'analyse sémantique d'un mot si ce dernier apparaît dans des dizaines de milliers de contextes ? Dans un ensemble documentaire, tous les énoncés dans lesquels un mot apparaît n'ont pas la même valeur lexicographique. Certains d'entre eux véhiculent des informations primordiales en vue de produire un article de dictionnaire. Ce sont des énoncés définitoires. Cet ouvrage présente l'ensemble des ressources linguistiques dont la langue française dispose pour produire des énoncés définitoires et les stratégies de fouille documentaire à utiliser en vue de repérer automatiquement ces énoncés dans les bases de données textuelles.},
  annote   = {{{\textless}p{\textgreater}augerAlain1997.pdf{\textless}/p{\textgreater}}}
}
-
J. Krause, C. Züll, J. Harkness, and J. H. P. Hoffmeyer-Zlotnick, "Principles of content analysis for information retrieval systems." Mannheim: Zuma, 1996, pp. 77-100.
@incollection{krause_principles_1996,
  author    = {Krause, J.},
  title     = {Principles of content analysis for information retrieval systems},
  editor    = {Züll, C. and Harkness, J. and Hoffmeyer-Zlotnik, J. H. P.},
  booktitle = {Text analysis and computers},
  series    = {Zuma Nachrichten Special},
  pages     = {77--100},
  publisher = {Zuma},
  address   = {Mannheim},
  year      = {1996},
  keywords  = {Analyse de contenu, Recherche d'information}
}
-
É. Martin and F. Rastier, "Thème d’étude, étude de thème." Paris: Didier, 1995, pp. 13-24.
@incollection{martin_thme_1995,
  author    = {Martin, Éveline},
  title     = {Thème d'étude, étude de thème},
  editor    = {Rastier, François},
  booktitle = {L'analyse thématique des données textuelles : l'exemple des sentiments},
  series    = {Études de sémantique lexicale},
  pages     = {13--24},
  publisher = {Didier},
  address   = {Paris},
  year      = {1995},
  keywords  = {Analyse de contenu}
}
-
U. Eco, Les limites de l’interprétation, Paris: Grasset, 1992.
@book{eco_les_1992,
  author    = {Eco, Umberto},
  title     = {Les limites de l'interprétation},
  publisher = {Grasset},
  address   = {Paris},
  year      = {1992},
  keywords  = {Analyse de contenu}
}
-
M. Collot, "Le thème selon la critique thématique," Communications, iss. 47, pp. 79-90, 1988.
@article{collot_le_1988,
  author   = {Collot, Michel},
  title    = {Le thème selon la critique thématique},
  journal  = {Communications},
  number   = {47},
  pages    = {79--90},
  year     = {1988},
  keywords = {Analyse de contenu}
}
-
G. Prince, "Le thème du récit," Communications, iss. 47, pp. 199-208, 1988.
@article{prince_le_1988,
  author   = {Prince, Gerald},
  title    = {Le thème du récit},
  journal  = {Communications},
  number   = {47},
  pages    = {199--208},
  year     = {1988},
  keywords = {Analyse de contenu}
}
-
S. Rimmon-Kenan, "Qu’est-ce qu’un thème," Poétique, vol. 64, pp. 397-406, 1985.
@article{rimmon-kenan_quest-ce_1985,
  author   = {Rimmon-Kenan, Shlomith},
  title    = {Qu'est-ce qu'un thème},
  journal  = {Poétique},
  volume   = {64},
  pages    = {397--406},
  year     = {1985},
  keywords = {Analyse de contenu}
}
-
A. Gull and F. Sebastiani, "Automatic Web page categorization by link and context analysis."
@comment{The CiteSeerX record id lives in the ignored `citeseerx` field because it is not a registered DOI. NOTE(review): journal and year are still missing for this article entry - confirm the venue.}
@article{gull_automatic_????,
  author    = {Gull, Antonio and Sebastiani, Fabrizio},
  title     = {Automatic Web page categorization by link and context analysis},
  url       = {http://nmis.isti.cnr.it/sebastiani/Publications/THAI99.pdf},
  citeseerx = {10.1.1.29.8904},
  keywords  = {Analyse de contenu, Catégorisation},
  abstract  = {Assistance in retrieving documents on the World Wide Web is provided either by search engines, through keyword-based queries, or by catalogues, which organize documents into hierarchical collections. Maintaining catalogues manually is becoming increasingly difficult, due to the sheer amount of material on the Web; it is thus becoming necessary to resort to techniques for the automatic classification of documents. Automatic classification is traditionally performed by extracting the information for representing a document (``indexing'') from the document itself. The paper describes the novel technique of categorization by context, which instead extracts useful information for classifying a document from the context where a {URL} referring to it appears. We present the results of experimenting with Theseus, a classifier that exploits this technique.},
  annote    = {{{\textless}p{\textgreater}gullAntonio.pdf{\textless}/p{\textgreater}}}
}
-
H. Anaya-Sánchez, R. Berlanga-Llavori, and A. Pons-Porrata, "Retrieval of relevant concepts from a text collection," in Current topics in artificial intelligence : 12th conference of the Spanish association for artificial intelligence, CAEPIA 2007, Salamanca, Spain, november 12-16, 2007 : selected papers, Berlin; Heidelberg, 2007, pp. 21-30.
@inproceedings{anaya-snchez_retrieval_2007,
  author    = {Anaya-Sánchez, Henry and Berlanga-Llavori, Rafael and Pons-Porrata, Aurora},
  title     = {Retrieval of relevant concepts from a text collection},
  booktitle = {Current topics in artificial intelligence : 12th conference of the Spanish association for artificial intelligence, {CAEPIA} 2007, Salamanca, Spain, november 12-16, 2007 : selected papers},
  series    = {Lecture notes in computer science},
  volume    = {4788},
  pages     = {21--30},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2007},
  doi       = {10.1007/978-3-540-75271-4_3},
  url       = {http://dx.doi.org/10.1007/978-3-540-75271-4_3},
  keywords  = {Analyse de corpus, Fouille de texte, Recherche d'information},
  abstract  = {This paper addresses the characterization of a large text collection by introducing a method for retrieving sets of relevant {WordNet} concepts as descriptors of the collection contents. The method combines models for identifying interesting word co-occurrences with an extension of a word sense disambiguation algorithm in order to retrieve the concepts that better fit in with the collection topics. Multi-word nominal concepts that do not explicitly appear in the texts, can be found among the retrieved concepts. We evaluate our proposal using extensions of recall and precision that are also introduced in this paper.}
}
-
Y. Li, D. McLean, Z. A. Bandar, J. D. O’Shea, and K. Crockett, "Sentence similarity based on semantic nets and corpus statistics," IEEE Transactions on Knowledge and Data Engineering, vol. 18, iss. 8, pp. 1138-1150, 2006.
@article{li_sentence_2006,
  author   = {Li, Y. and McLean, D. and Bandar, Z. A. and O'Shea, J. D. and Crockett, K.},
  title    = {Sentence similarity based on semantic nets and corpus statistics},
  journal  = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume   = {18},
  number   = {8},
  pages    = {1138--1150},
  year     = {2006},
  issn     = {1041-4347},
  doi      = {10.1109/TKDE.2006.19},
  url      = {http://ieeexplore.ieee.org/Xplore/login.jsp?url=/iel5/69/33191/01563981.pdf?tp=&isnumber=33191&arnumber=1563981&punumber=69},
  keywords = {Analyse de corpus},
  abstract = {Sentence similarity measures play an increasingly important role in text-related research and applications in areas such as text mining, Web page retrieval, and dialogue systems. Existing methods for computing sentence similarity have been adopted from approaches used for long text documents. These methods process sentences in a very high-dimensional space and are consequently inefficient, require human input, and are not adaptable to some application domains. This paper focuses directly on computing the similarity between very short texts of sentence length. It presents an algorithm that takes account of semantic information and word order information implied in the sentences. The semantic similarity of two sentences is calculated using information from a structured lexical database and from corpus statistics. The use of a lexical database enables our method to model human common sense knowledge and the incorporation of corpus statistics allows our method to be adaptable to different domains. The proposed method can be used in a variety of applications that involve text knowledge representation and discovery. Experiments on two sets of selected sentence pairs demonstrate that the proposed method provides a similarity measure that shows a significant correlation to human intuition.},
  annote   = {{{\textless}p{\textgreater}liYuhua2006.pdf{\textless}/p{\textgreater}}}
}
-
P. Cimiano, S. Staab, and A. Hotho, "Learning concept hierarchies from text corpora using formal concept analysis," Journal of Artificial Intelligence Research, vol. 24, pp. 305-339, 2005.
@article{cimiano_learning_2005,
  author   = {Philipp Cimiano and Steffen Staab and Andreas Hotho},
  title    = {Learning concept hierarchies from text corpora using formal concept analysis},
  journal  = {Journal of Artificial Intelligence Research},
  volume   = {24},
  pages    = {305--339},
  year     = {2005},
  doi      = {10.1613/jair.1648},
  url      = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.60.228},
  abstract = {We present a novel approach to the automatic acquisition of taxonomies or concept hierarchies from a text corpus. The approach is based on Formal Concept Analysis {(FCA),} a method mainly used for the analysis of data, i.e. for investigating and processing explicitly given information. We follow Harris ’ distributional hypothesis and model the context of a certain term as a vector representing syntactic dependencies which are automatically acquired from the text corpus with a linguistic parser. On the basis of this context information, {FCA} produces a lattice that we convert into a special kind of partial order constituting a concept hierarchy. The approach is evaluated by comparing the resulting concept hierarchies with hand-crafted taxonomies for two domains: tourism and finance. We also directly compare our approach with hierarchical agglomerative clustering as well as with {Bi-Section-KMeans} as an instance of a divisive clustering algorithm. Furthermore, we investigate the impact of using different measures weighting the contribution of each attribute as well as of applying a particular smoothing technique to cope with data sparseness.},
  keywords = {Analyse de corpus},
  annote   = {{{\textless}p{\textgreater}cimianoPhilipp2005.pdf{\textless}/p{\textgreater}{\textless}p{\textgreater}\ {\textless}/p{\textgreater}}}
},
-
T. Tao and C. Zhai, "Mining comparable bilingual text corpora for cross-language information integration," Chicago, Illinois, USA, 2005, pp. 691-696.
@inproceedings{tao_mining_2005,
  author    = {Tao Tao and {ChengXiang} Zhai},
  title     = {Mining comparable bilingual text corpora for cross-language information integration},
  booktitle = {Proceedings of the eleventh {ACM} {SIGKDD} international conference on Knowledge discovery in data mining},
  publisher = {{ACM}},
  address   = {Chicago, Illinois, {USA}},
  pages     = {691--696},
  year      = {2005},
  isbn      = {{1-59593-135-X}},
  doi       = {10.1145/1081870.1081958},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1081958&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  abstract  = {Integrating information in multiple natural languages is a challenging task that often requires manually created linguistic resources such as a bilingual dictionary or examples of direct translations of text. In this paper, we propose a general cross-lingual text mining method that does not rely on any of these resources, but can exploit comparable bilingual text corpora to discover mappings between words and documents in different languages. Comparable text corpora are collections of text documents in different languages that are about similar topics; such text corpora are often naturally available (e.g., news articles in different languages published in the same time period). The main idea of our method is to exploit frequency correlations of words in different languages in the comparable corpora and discover mappings between words in different languages. Such mappings can then be used to further discover mappings between documents in different languages, achieving cross-lingual information integration. Evaluation of the proposed method on a {120MB} {Chinese-English} comparable news collection shows that the proposed method is effective for mapping words and documents in English and Chinese. Since our method only relies on naturally available comparable corpora, it is generally applicable to any language pairs as long as we have comparable corpora.},
  keywords  = {Analyse de corpus, Fouille de texte},
  annote    = {{{\textless}p{\textgreater}taoTao2005.pdf{\textless}/p{\textgreater}}}
},
-
C. Bodson, "Termes et relations sémantiques en corpus spécialisés : rapports entre patrons de relations sémantiques (PRS) et types sémantiques (TS)," Master's Thesis, Université de Montréal, 2005.
@mastersthesis{bodson_termes_2005,
  author   = {Claudine Bodson},
  title    = {Termes et relations sémantiques en corpus spécialisés : rapports entre patrons de relations sémantiques {(PRS)} et types sémantiques {(TS)}},
  type     = {Thèse (M. A.)},
  school   = {Université de Montréal},
  year     = {2005},
  pages    = {119 f.},
  url      = {http://www.olst.umontreal.ca/pdf/bodson2005.pdf},
  abstract = {Les travaux entrepris dans le cadre de la présente thèse se situent dans le domaine de l’extraction semi-automatique d’informations sémantiques. Nous présentons un modèle dont l’objectif est de faciliter les recherches dans les corpus spécialisés. Nous cherchons à répondre aux besoins de la terminologie en repérant dans les textes des éléments utiles à la formulation de définitions. Notre objectif principal consiste à élaborer un modèle d’association des patrons de relations sémantiques {(PRS)} aux types sémantiques {(TS)} auxquels appartiennent les termes. Les travaux antérieurs ont montré l’intérêt général des {PRS} pour la terminologie. L’association de patrons aux {TS} permettrait des requêtes encore plus précises dans les corpus spécialisés. Nous postulons que les {PRS} peuvent être associés à des {TS} spécifiques indépendamment des domaines auxquels se rattachent les termes. Pour le démontrer, nous avons utilisé deux corpus; l’un porte sur la médecine et l’autre sur l’informatique. Ils sont composés de textes didactiques et de vulgarisation français : ces genres textuels sont le plus susceptibles de comporter des contextes définitoires. Nous avons étudié trente-trois termes associés à près de 710 contextes contenant un {PRS.} Nous nous sommes penchée sur les relations sémantiques d’hyperonymie, de méronymie, de finalité et de causalité ainsi que sur les patrons métalinguistiques. Les termes choisis appartiennent à des types sémantiques différents et leur classement s’inspire de la hiérarchie des noms de {WordNet.} Cette recherche montre qu’il est possible de mettre en rapport des relations sémantiques à des {TS} et ce, indépendamment des deux domaines étudiés. Cependant, elle montre également que la plupart des {PRS} sont spécifiques à un corpus.},
  keywords = {Analyse de corpus, Linguistique},
  annote   = {{\textless}p{\textgreater}bodsonClaudine2005.pdf{\textless}/p{\textgreater}{\textless}p{\textgreater}\ {\textless}/p{\textgreater}{\textless}p{\textgreater}"Mémoire présenté à la Faculté des études supérieures en vue de l'obtention du grade de Maître ès arts (M. A.) en traduction option recherche"{\textless}/p{\textgreater}}
},
-
F. Neri and R. Raffaelli, "Text mining applied to multilingual corpora," in Knowledge mining : proceedings of the NEMIS 2004 final conference, Berlin; Heidelberg, 2005, pp. 123-231.
@inproceedings{neri_text_2005,
  author    = {Federico Neri and Remo Raffaelli},
  title     = {Text mining applied to multilingual corpora},
  booktitle = {Knowledge mining : proceedings of the {NEMIS} 2004 final conference},
  series    = {Studies in fuzziness and soft computing},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  pages     = {123--231},
  year      = {2005},
  doi       = {10.1007/3-540-32394-5_9},
  abstract  = {Up to 80\% of electronic data is textual and most valuable information is often encoded in pages which are neither structured, nor classified. Documents are — and will be — written in various native languages, but these documents are relevant even to non-native speakers. Nowadays everyone experiences a mounting frustration in the attempt of finding the information of interest, wading through thousands of pieces of data. The process of accessing all these raw data, heterogeneous for language used, and transforming them into information is therefore inextricably linked to the concepts of textual analysis and synthesis, hinging greatly on the ability to master the problems of multilinguality. Through Multilingual Text Mining, users can get an overview of great volumes of textual data having a highly readable grid, which helps them discover meaningful similarities among documents and find all related information. This paper describes the approach used by {SYNTHEMA} for Multilingual Text Mining, showing the classification results on around 600 breaking news written in English, Italian and French.},
  keywords  = {Analyse de corpus, Fouille de texte}
},
-
A. Condamines, Sémantique et corpus, Paris: Hermès science publications, 2005.
@book{condamines_smantique_2005,
  editor    = {Anne Condamines},
  title     = {Sémantique et corpus},
  publisher = {Hermès science publications},
  address   = {Paris},
  year      = {2005},
  abstract  = {La question du sens examinée à travers l'analyse de textes n'est pas nouvelle : depuis des siècles, des disciplines s'intéressent à cette question : la philosophie, la philologie, l'analyse comparative, et plus récemment, la sociolinguistique, l'ethnolinguistique. La nouveauté vient de ce que le développement de l'informatique a mis à disposition un grand nombre de textes via l'internet et d'outils pour les interroger. Cet ouvrage cherche à comprendre en quoi ce développement majeur permet un nouvel éclairage sur le problème de l'étude du sens dans les textes. La nécessaire clôture du corpus (ensemble de textes rassemblés pour une étude déterminée) crée un effet de limite et de confrontation à la réalité qui oblige à une réflexion sur les connaissances mises en œuvre lors de l'analyse. Les régularités mises au jour lors d'une analyse peuvent-elles être la base de l'élaboration d'un système de règles ? Comment les contextes de production et d'interprétation des textes peuvent-ils être pris en compte lors de l'analyse ? La notion de genre textuel peut-elle être une possibilité pour généraliser les résultats d'analyse ? En quoi les méthodes d'analyse (outils de {TAL,} méthodes quantitatives…) viennent-elles multiplier les points de vue sur un corpus et en quoi risquent-elles de biaiser les observations ? L'objectif de l'étude : analyse discursive, analyse littéraire, constitution de ressources en {TAL,} ingénierie des connaissances… a-t-il une influence déterminante sur les choix de catégorisation des phénomènes observés ? Ce sont quelques-unes des questions auxquelles des chercheurs d'horizons divers tentent de donner des réponses. Table des matières : Introduction : Sémantique et corpus, quelles rencontres possibles ? {-A.} Condamines. Les aspects dynamiques de la composition sémantique de l'oral {-C.} {Blanche-Benveniste.} L'analyse de corpus en linguistique interactionnelle : de l'étude de cas singuliers à l'étude de collections {-L.} Mondada.
Utilisation de corpus pour l'évaluation d'hypothèses linguistiques : étude de autrement {-B.} Lamiroy, M. Charolles. Constitution et exploitation d'un corpus de français médiéval : enjeux, spécificités et apports {-S.} Prévost. Discours, corpus, traitements automatiques {-M.-P.} {Pery-Woodley.} Sur quelle sémantique reposent les méthodes automatiques d'accès au contenu textuel ? {-A.} Nazarenko. Corpus et sémantique discursive : éléments de méthode pour la lecture des corpus {-J.-M.} Viprey. Des décalages de distribution aux divergences d'acception {-B.} Habert, G. Illouz, H. Folch. Corpus et connaissances : de l'extraction linguistique à la modélisation conceptuelle {-B.} Bachimont.},
  keywords  = {Analyse de corpus, Linguistique}
},
-
L. Gillam, M. Tariq, and K. Ahmad, "Terminology and the construction of ontology," Terminology, vol. 11, iss. 1, pp. 55-81, 2005.
@article{gillam_terminology_2005,
  author   = {Lee Gillam and Mariam Tariq and Khurshid Ahmad},
  title    = {Terminology and the construction of ontology},
  journal  = {Terminology},
  volume   = {11},
  number   = {1},
  pages    = {55--81},
  year     = {2005},
  issn     = {0929-9971},
  abstract = {This paper discusses a method for corpus-driven ontology design: extracting conceptual hierarchies from arbitrary domain-specific collections of texts. These hierarchies can form the basis for a concept-oriented (onomasiological) terminology collection, and hence may be used as the basis for developing knowledge-based systems using ontology editors. This reference to ontology is explored in the context of collections of terms. The method presented is a hybrid of statistical and linguistic techniques, employing statistical techniques initially to elicit a conceptual hierarchy, which is then augmented through linguistic analysis. The result of such an extraction may be useful in information retrieval, knowledge management, or in the discipline of terminology science itself.},
  keywords = {Analyse de corpus, Langage naturel, Méthodologie, Ontologie},
  annote   = {{{\textless}p{\textgreater}gillamLee2005.pdf{\textless}/p{\textgreater}}}
},
-
E. Savia, S. Kaski, V. Tuulos, and P. Myllymäki, "On text-based estimation of document relevance," 2004, pp. 3275-3280.
@inproceedings{savia_text-based_2004,
  author    = {E. Savia and S. Kaski and V. Tuulos and P. Myllymäki},
  title     = {On text-based estimation of document relevance},
  booktitle = {Proceedings of the {IEEE} International Joint Conference on Neural Networks ({IJCNN} 2004)},
  pages     = {3275--3280},
  year      = {2004},
  url       = {http://citeseer.ist.psu.edu/article/savia04textbased.html},
  abstract  = {This work is part of a proactive information retrieval project that aims at estimating relevance from implicit user feedback. The noisy feedback signal needs to be complemented with all available information, and textual content is one of the natural sources. Here we take the first steps by investigating whether this source is at all useful in the challenging setting of estimating the relevance of a new document based on only few samples with known relevance. It turns out that even sophisticated unsupervised methods like multinomial {PCA} (or Latent Dirichlet Allocation) cannot help much. By contrast, feature extraction supervised by relevant auxiliary data may help.},
  keywords  = {Analyse de corpus, Classification, Fouille de texte}
},
-
T. Roy, P. Beust, G. Purnelle, C. Fairon, and A. Dister, "ProxiDocs : un outil de cartographie et de catégorisation thématique de corpus," , Louvain-la-Neuve, 2004, pp. 978-986.
@inproceedings{roy_proxidocs_2004,
  author    = {Thibault Roy and Pierre Beust},
  editor    = {G. Purnelle and C. Fairon and A. Dister},
  title     = {{ProxiDocs} : un outil de cartographie et de catégorisation thématique de corpus},
  booktitle = {Le poids des mots : actes des 7es Journées internationales d'analyse statistique des données textuelles ({JADT} 2004)},
  volume    = {2},
  publisher = {Presses universitaires de Louvain},
  address   = {{Louvain-la-Neuve}},
  pages     = {978--986},
  month     = mar,
  year      = {2004},
  abstract  = {This paper presents a software, called {ProxiDocs,} which constructs a representation (a map) of the thematic structure of a whole of textual documents. {ProxiDocs} allows its users to realize a thematic analysis of a corpora according to his needs. These maps show thematic similarities and differences between documents which belong to a same corpora. The hypothesis tested with {ProxiDocs} affects the instrumentation of the meaning’s thematic dimension with statistical processing, like principal components analysis and hierarchical clustering. Cet article présente une application dédiée à la visualisation de propriétés thématiques d’un ensemble de documents électroniques. Cette application, nommée {ProxiDocs,} a pour but d’assister son utilisateur dans des tâches de veille technologique en lui donnant les moyens d’une analyse thématique de corpus par rapport à ses propres centres d’intérêts. {ProxiDocs} produit des cartes interactives mettant en évidence les proximités et les différences thématiques des documents composant le corpus donné, ainsi que la répartition des différents thèmes utilisés dans ces documents. Les hypothèses testées dans le cadre de {ProxiDocs} concernent donc l’instrumentation de la dimension thématique du sens par des traitements statistiques, tels l’analyse en composantes principales et la catégorisation hiérarchique ascendante.},
  keywords  = {Analyse de corpus}
},
-
V. Claveau and M.-C. L’Homme, "Discovering specific semantic relationships between nouns and verbs in a specialized French corpus," in Computerm 2004, Genève, Suisse, 2004, pp. 39-46.
@inproceedings{claveau_discovering_2004,
  author    = {Vincent Claveau and {Marie-Claude} {L'Homme}},
  title     = {Discovering specific semantic relationships between nouns and verbs in a specialized {French} corpus},
  booktitle = {Computerm 2004},
  address   = {Genève, Suisse},
  pages     = {39--46},
  year      = {2004},
  abstract  = {Recent literature in computational terminology has shown an increasing interest in identifying various semantic relationships between terms. In this paper, we propose an original strategy to find specific noun-verb combinations in a specialized corpus. We focus on verbs that convey a meaning of realization. To acquire these noun-verb pairs, we use {ASARES,} a machine learning technique that automatically infers extraction patterns from examples and counter-examples of realization noun-verb pairs. The patterns are then applied to the corpus to retrieve new pairs. Results, measured with a large test set, show that our acquisition technique outperforms classical statistical methods used for collocation acquisition. Moreover, the inferred patterns yield interesting clues on which structures are more likely to convey the target semantic link.},
  keywords  = {Analyse de corpus, Fouille de texte},
  annote    = {{{\textless}p{\textgreater}claveauVincent2004.pdf{\textless}/p{\textgreater}}}
},
-
P. Drouin, "Detection of domain specific terminology using corpora comparison," in Proceedings of the fourth international conference on language resources and evaluation (LREC), 2004.
@inproceedings{drouin_detection_2004,
  author    = {Patrick Drouin},
  title     = {Detection of domain specific terminology using corpora comparison},
  booktitle = {Proceedings of the fourth international conference on language resources and evaluation {(LREC)}},
  year      = {2004},
  url       = {http://www.olst.umontreal.ca/pdf/LREC_2004.pdf},
  abstract  = {Le présent article décrit une technique d’acquisition automatique des termes reposant sur les spécificités lexicales des corpus techniques. Plus spécifiquement, nous nous intéressons à l’acquisition automatique des termes simples en langue anglaise et au gain en précision réalisé grâce à la méthodologie proposée. Nous donnons tout d’abord une description de la méthodologie utilisée pour l’acquisition des spécificités lexicales des corpus. Par la suite, nous nous proposons une stratégie d’acquisition automatique de termes qui exploite ces spécificités. Enfin, nous présentons les corpus utilisés dans le cadre de notre démarche ainsi que les résultats obtenus.},
  keywords  = {Analyse de corpus, Terminologie},
  annote    = {{{\textless}p{\textgreater}drouinPatrick2004.pdf{\textless}/p{\textgreater}}}
},
-
M. Rossignol and P. Sébillot, "Extraction statistique sur corpus de classes de mots-clés thématiques," Traitement automatique des langues, vol. 44, iss. 33, pp. 217-246, 2003.
-
C. Lemay, "Identification automatique du vocabulaire caractéristique de l’informatique fondée sur la comparaison de corpus," Master's Thesis, Université de Montréal, 2003.
@mastersthesis{lemay_identification_2003,
  author   = {Chantal Lemay},
  title    = {Identification automatique du vocabulaire caractéristique de l’informatique fondée sur la comparaison de corpus},
  type     = {Mémoire de maîtrise, Département de linguistique et de traduction},
  school   = {Université de Montréal},
  year     = {2003},
  url      = {http://www.olst.umontreal.ca/pdf/memoirelemay.pdf},
  abstract = {Notre travail s’inscrit dans le cadre de projets relevant du domaine du traitement automatique des langues, plus précisément de la terminologie computationnelle. Nous présentons deux méthodes visant à isoler de façon automatique le vocabulaire caractéristique du domaine de l’informatique. Ce vocabulaire englobe un vocabulaire fonctionnel et un vocabulaire spécialisé. La plupart des travaux portant sur l’extraction d’unités lexicales à partir de corpus s’attachent à extraire la terminologie, plus spécifiquement les termes complexes du corpus. Notre travail a ceci de particulier qu’il s’intéresse à l’extraction d’unités lexicales simples, lesquelles ne sont pas nécessairement des termes, mais des unités pouvant appartenir à toutes les parties du discours. Afin d’isoler notre vocabulaire, deux approches sont mises à contribution. Dans la première, nous opposons un corpus de référence, soit le corpus journalistique Le Monde, et un corpus d’analyse, soit un corpus composé de textes traitant de l’informatique. Dans la deuxième approche, nous opposons un corpus de référence, soit un corpus de textes sur l’informatique, et six sous-corpus d’analyse, chacun étant un sous-ensemble du corpus de référence et chacun représentant un sous-domaine de l’informatique. La comparaison des corpus repose sur la technique des spécificités et sur l’application de modèles statistiques, lesquels sont intégrés à un logiciel d’acquisition automatique des termes : {TermoStat.} Dans le but de valider le vocabulaire issu de nos deux méthodes de comparaison de corpus, nous avons recours à deux dictionnaires spécialisés de l’informatique. Les résultats seront principalement appliqués à la construction d’un dictionnaire de base de l’informatique. Par ailleurs, ce travail nous permettra de proposer de nouvelles perspectives pour des outils d’extraction terminologique.
Mots clés : terminologie, terminologie computationnelle, comparaison de corpus, vocabulaire caractéristique, vocabulaire de base, spécificités, spécificités positives, statistique textuelle, langue de spécialité},
  keywords = {Analyse de corpus, Informatique},
  annote   = {{{\textless}p{\textgreater}lemayChantal2003.pdf{\textless}/p{\textgreater}}}
},
-
S. Patwardhan, "Incorporating dictionary and corpus information into a context vector measure of semantic relatedness," Master's Thesis, University of Minnesota, 2003.
@mastersthesis{patwardhan_incorporating_2003,
  author   = {Siddharth Patwardhan},
  title    = {Incorporating dictionary and corpus information into a context vector measure of semantic relatedness},
  type     = {Thèse {(M.A.)}},
  school   = {University of Minnesota},
  year     = {2003},
  keywords = {Analyse de corpus, Fouille de texte},
  annote   = {{{\textless}p{\textgreater}patwardhanSiddharth2003.pdf{\textless}/p{\textgreater}}}
},
-
C.-H. Lee and H.-C. Yang, "Text mining of multilingual corpora via computing semantic relatedness," Yasmine Hammamet, Tunisia, 2002, pp. 46-50.
@inproceedings{lee_text_2002,
  author    = {{Chung-Hong} Lee and {Hsin-Chang} Yang},
  title     = {Text mining of multilingual corpora via computing semantic relatedness},
  booktitle = {Proceedings of the {IEEE} International Conference on Systems, Man and Cybernetics},
  volume    = {5},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {Yasmine Hammamet, Tunisia},
  pages     = {46--50},
  year      = {2002},
  issn      = {0884-3627},
  abstract  = {This paper describes a new application of a text-mining algorithm to the text sources of bilingual corpora. In the past, the majority of the approaches applied to measuring semantic relatedness was based on edge counting methods through a semantic network, such as {WordNet} It is not well suited for applications in specific domains in which the standard lexical knowledge bases are not available. In this work, we propose an alternative solution for acquisition of semantic relatedness from text corpora by means of a machine learning technique, namely the self-organizing maps. This paper presents a hybrid approach to discovering a concept-based feature map containing word clusters and document clusters from multilingual text collections. Using {SOM-based} automatic clustering techniques, we have conducted several experiments to uncover associated documents based on {Chinese-English} bilingual parallel corpora, and a hybrid {Chinese-English} corpus. In essence, this work provides a method for automatic text clustering, which resolves some of the language difficulties in concept discovery and categorization from multilingual text corpora.},
  keywords  = {Analyse de corpus, Fouille de texte},
  annote    = {{{\textless}p{\textgreater}Compilation} and indexing terms, Copyright 2007 Elsevier Inc. All rights reserved 03277530026 Text clustering{\textless}/p{\textgreater}}
},
-
X. Hu and E. Atwell, A survey of machine learning approaches to analysis of large corpora, 2002.
@misc{hu_survey_2002,
  author   = {Xunlei Hu and Eric Atwell},
  title    = {A survey of machine learning approaches to analysis of large corpora},
  year     = {2002},
  url      = {http://citeseer.ist.psu.edu/565578.html},
  abstract = {Corpus-based Machine Learning of linguistic annotations has been a key topic for all areas of Natural Language Processing. This paper presents a survey, along three dimensions of classification. First we outline different linguistic level of analysis: Tokenisation, {Part-of-Speech} tagging, Parsing, Semantic analysis and Discourse annotation. Secondly, we introduce alternative approaches to Machine Learning applicable to linguistic annotation of corpora: N-gram and Markov models, Neural ...},
  keywords = {Analyse de corpus, Apprentissage machine},
  annote   = {{{\textless}p{\textgreater}huXunlei2002.pdf{\textless}/p{\textgreater}}}
},
-
S. A. Caraballo, "Automatic construction of a hypernym-labeled noun hierarchy from text," PhD Thesis , 2001.
@phdthesis{caraballo_automatic_2001,
  author   = {Sharon Ann Caraballo},
  title    = {Automatic construction of a hypernym-labeled noun hierarchy from text},
  type     = {Thèse (de doctorat)},
  school   = {Brown University},
  year     = {2001},
  pages    = {52 f.},
  keywords = {Analyse de corpus},
  annote   = {{{\textless}p{\textgreater}caraballoSharon2001.pdf{\textless}/p{\textgreater}}}
},
-
N. Aussenac-Gilles, B. Biebow, and S. Szulman, "Revisiting ontology design : a method based on corpus analysis," in Knowledge engineering and knowledge management methods, models, and tools : 12th international conference, EKAW 2000 Juan-les-Pins, France, october 2–6, 2000 : proceedings, Berlin; Heidelberg, 2000, pp. 27-66.
@inproceedings{aussenac-gilles_revisiting_2000,
  author    = {N. {Aussenac-Gilles} and B. Biebow and S. Szulman},
  title     = {Revisiting ontology design : a method based on corpus analysis},
  booktitle = {Knowledge engineering and knowledge management methods, models, and tools : 12th international conference, {EKAW} 2000 {Juan-les-Pins,} France, october 2–6, 2000 : proceedings},
  series    = {Lecture notes in computer science; 1937. Lecture notes in artificial intelligence},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  pages     = {27--66},
  year      = {2000},
  issn      = {0302-9743},
  abstract  = {We promote a new approach for knowledge modelling based on knowledge elicitation from technical documents. It benefits of the increasing amount of available electronic texts and of the maturity of natural language processing tools. The approach defines a framework where the knowledge engineer selects the appropriate tools, combines their use and interprets their results to build up a domain model. The paper presents the method and reports an on-going application to design an ontology of knowledge engineering tools in French.},
  keywords  = {Analyse de corpus, Design, Ontologie}
},
-
H. Ragas and C. H. A. Koster, "Four text classification algorithms compared on a dutch corpus," in Proceedings of the 21st annual international ACM SIGIR conference on Research and development in information retrieval, Melbourne, Australia, 1998, pp. 369-370.
@inproceedings{ragas_four_1998,
  author    = {Hein Ragas and Cornelis H. A. Koster},
  title     = {Four text classification algorithms compared on a {Dutch} corpus},
  booktitle = {Proceedings of the 21st annual international {ACM} {SIGIR} conference on Research and development in information retrieval},
  publisher = {{ACM}},
  address   = {Melbourne, Australia},
  pages     = {369--370},
  year      = {1998},
  isbn      = {1-58113-015-5},
  doi       = {10.1145/290941.291059},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=291059&type=pdf&coll=GUIDE&dl=GUIDE&CFID=2704018&CFTOKEN=92073516},
  abstract  = {We describe an experiment in applying text classification algorithms to Dutch texts. Four well-known learning algorithms, Rocchio's algorithm, the Simple Bayesian Classifier {(SBC),} the Sleeping Experts {(SE)} and Winnow were implemented. They were tested on a corpus of articles from the Dutch newspaper {NRC,} pre-classified into four categories. The algorithms are compared on learning speed and error rate. We also investigated the effect of discarding terms, using either a dynamic stoplist or the Winnow heuristic.},
  keywords  = {Analyse de corpus, Classification},
  annote    = {{{\textless}p{\textgreater}ragasHein1998.pdf{\textless}/p{\textgreater}}}
},
-
K. McTait, "A survey of corpus analysis tools," PhD Thesis , 1997.
@phdthesis{mctait_survey_1997,
  author   = {Kevin {McTait}},
  title    = {A survey of corpus analysis tools},
  school   = {Université catholique de Louvain},
  month    = jul,
  year     = {1997},
  pages    = {24 p.},
  note     = {manuscrit non publié},
  abstract = {A corpus analysis tool has been envisaged throughout the course of the {ICLE} project {(International} Corpus of Learner English) at the Centre for English Corpus Linguistics {(CECL),} stemming largely from the dissatisfaction with existing corpus tools. This dissatisfaction comes about largely because more refined linguistic searches and flexible text analyses directly relevant to the {ICLE} project are needed. The following document will subsequently make a comprehensive (or as near to comprehensive as possible) survey of existing corpus analysis tools from both academic and commercial research organisations. The purpose of this survey was essentially to find out whether the corpus tool envisaged by the {ICLE} project already exists, at least in some form or other. However, this survey remains relevant to all researchers in the field as a reference to the state of the art in corpus processing software. In an attempt to provide this document with some sort of structure, the tools discussed have been classified according largely to their level of complexity and sophistication.},
  keywords = {Analyse de corpus},
  annote   = {{{\textless}p{\textgreater}mctaitKevin1997.pdf{\textless}/p{\textgreater}}}
},
-
H. Kucera and N. W. Francis, Computational analysis of present-day American English, Providence: Brown University Press, 1967.
@book{kucera_computational_1967,
  author    = {Henry Kucera and Nelson W. Francis},
  title     = {Computational analysis of present-day American English},
  publisher = {Brown University Press},
  address   = {Providence},
  year      = {1967},
  keywords  = {Analyse de corpus}
},
-
B. Hufschmitt and A. Lelu, Organisation d’un corpus philosophique, les oeuvres (françaises) de Descartes.
@misc{hufschmitt_organisation_????,
  author   = {Benoit Hufschmitt and Alain Lelu},
  title    = {Organisation d'un corpus philosophique, les oeuvres (françaises) de Descartes},
  url      = {http://www.sdc2006.org/cdrom/contributions/Hufschmitt-isko-sdc.pdf},
  keywords = {Analyse de corpus, Philosophie},
  annote   = {{{\textless}p{\textgreater}hufschmittBenoit.pdf{\textless}/p{\textgreater}}}
},
-
T. Lee, "Constraint-based ontology induction from online customer reviews," Group Decision and Negotiation, vol. 16, pp. 255-281, 2007.
@article{lee_constraint-based_2007,
  author = {Thomas Lee},
  title = {Constraint-based ontology induction from online customer reviews},
  journal = {Group Decision and Negotiation},
  volume = {16},
  number = {3},
  pages = {255--281},
  year = {2007},
  url = {http://www.ingentaconnect.com/content/klu/grup/2007/00000016/00000003/00009065},
  keywords = {Analyse de texte, Fouille de texte, Ontologie},
  abstract = {We present an unsupervised, domain-independent technique for inducing a product-specific ontology of product features based upon online customer reviews. We frame ontology induction as a logical assignment problem and solve it with a bounds consistency constrained logic program. Using shallow natural language processing techniques, reviews are parsed into phrase sequences where each phrase refers to a single concept. Traditional document clustering techniques are adapted to collect phrases into initial concepts. We generate a token graph for each initial concept cluster and find a maximal clique to define the corresponding logical set of concept sub-elements. The logic program assigns tokens to clique sub-elements. We apply the technique to several thousand digital camera customer reviews and evaluate the results by comparing them to the ontologies represented by several prominent online buying guides. Because our results are drawn directly from customer comments, differences between our automatically induced product features and those in extant guides may reflect opportunities for better managing customer-producer relationships rather than errors in the process.}
}
-
A. E. Smith and M. S. Humphreys, "Evaluation of unsupervised semantic mapping of natural language with Leximancer concept mapping," Behavior Research Methods, vol. 38, iss. 2, pp. 262-79, 2006.
@article{smith_evaluation_2006,
  author = {A. E. Smith and M. S. Humphreys},
  title = {Evaluation of unsupervised semantic mapping of natural language with Leximancer concept mapping},
  journal = {Behavior Research Methods},
  volume = {38},
  number = {2},
  pages = {262--279},
  year = {2006},
  keywords = {Analyse de texte, Langage naturel, Recherche d'information, Thésaurus},
  abstract = {The Leximancer system is a relatively new method for transforming lexical cooccurrence information from natural language into semantic patterns in an unsupervised manner. It employs two stages of co-occurrence information extraction - semantic and relational - using a different algorithm for each stage. The algorithms used are statistical, but they employ nonlinear dynamics and machine learning. This article is an attempt to validate the output of Leximancer, using a set of evaluation criteria taken from content analysis that are appropriate for knowledge discovery tasks},
  annote = {{{\textless}p{\textgreater}Copyright} 2007, The Institution of Engineering and Technology 9338797 {1554-351X} unsupervised semantic mapping evaluation natural language Leximancer concept mapping lexical cooccurrence information transformation semantic cooccurrence information extraction relational cooccurrence information extraction statistical algorithm machine learning content analysis knowledge discovery{\textless}/p{\textgreater}}
}
-
R. Mihalcea, "Random walks on text structures." Springer, 2006, pp. 249-262.
@incollection{mihalcea_random_2006,
  author = {Rada Mihalcea},
  title = {Random walks on text structures},
  booktitle = {Computational linguistics and intelligent text processing},
  series = {Lecture notes in computer science},
  volume = {3878},
  publisher = {Springer},
  year = {2006},
  pages = {249--262},
  isbn = {978-3-540-32205-4},
  keywords = {Analyse de texte, Architecture},
  abstract = {Since the early ages of artificial intelligence, associative or semantic networks have been proposed as representations that enable the storage of language units and the relationships that interconnect them, allowing for a variety of inference and reasoning processes, and simulating some of the functionalities of the human mind. The symbolic structures that emerge from these representations correspond naturally to graphs – relational structures capable of encoding the meaning and structure of a cohesive text, following closely the associative or semantic memory representations. The activation or ranking of nodes in such graph structures mimics to some extent the functioning of human memory, and can be turned into a rich source of knowledge useful for several language processing applications. In this paper, we suggest a framework for the application of graph-based ranking algorithms to natural language processing, and illustrate the application of this framework to two traditionally difficult text processing tasks: word sense disambiguation and text summarization.},
  annote = {{{\textless}p{\textgreater}mihalceaRada2006.pdf{\textless}/p{\textgreater}}}
}
-
X. Jiang and A.-H. Tan, "Mining ontological knowledge from domain-specific text documents," in Proceedings. Fifth IEEE International Conference on Data Mining, Houston, TX, USA, 2006, p. 4.
@inproceedings{xing_mining_2006,
  author = {Jiang, Xing and Tan, Ah-Hwee},
  title = {Mining ontological knowledge from domain-specific text documents},
  booktitle = {Proceedings. Fifth {IEEE} International Conference on Data Mining},
  address = {Houston, {TX,} {USA}},
  publisher = {{IEEE} Computer Society},
  year = {2006},
  pages = {4 pp.},
  note = {Copyright 2006, The Institution of Engineering and Technology},
  keywords = {Analyse de texte, Approche statistique, Fouille de donnée, Ontologie},
  abstract = {Traditional text mining systems employ shallow parsing techniques and focus on concept extraction and taxonomic relation extraction. This paper presents a novel system called {CRCTOL} for mining rich semantic knowledge in the form of ontology from domain-specific text documents. By using a full text parsing technique and incorporating both statistical and lexico-syntactic methods, the knowledge extracted by our system is more concise and contains a richer semantics compared with alternative systems. We conduct a case study wherein {CRCTOL} extracts ontological knowledge, specifically key concepts and semantic relations, from a terrorism domain text collection. Quantitative evaluation, by comparing with a state-of-the-art ontology learning system known as text-to-onto, has shown that {CRCTOL} produces much better precision and recall for both concept and relation extraction, especially from sentences with complex structures},
  annote = {{\textless}p{\textgreater}8857416 ontological knowledge mining domain-specific text document text mining full text parsing statistical method lexico-syntactic method concept extraction relation extraction concept relation concept tuple ontology learning{\textless}/p{\textgreater}}
}
-
W. M. Pottenger, S. Li, and C. D. Janneck, "Distributed higher-order text mining : theory and practice," San Diego, California, 2006, pp. 446-447.
@inproceedings{pottenger_distributed_2006,
  author = {William M. Pottenger and Shenzhi Li and Christopher D. Janneck},
  title = {Distributed higher-order text mining : theory and practice},
  shorttitle = {Distributed higher-order text mining},
  address = {San Diego, California},
  publisher = {{ACM}},
  year = {2006},
  pages = {446--447},
  doi = {10.1145/1146598.1146742},
  url = {http://portal.acm.org/ft_gateway.cfm?id=1146742&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  keywords = {Analyse de texte, Fouille de donnée, Fouille de texte},
  abstract = {This highlight discusses the current and ongoing research into distributed higher-order text mining, as implemented using the {DiHO} {ARM} algorithm in the {D-HOTM} system. The {DiHO} {ARM} algorithm performs association rule mining in the absence of full knowledge of a global schema on distributed data that is neither vertically nor horizontally fragmented. The {D-HOTM} system encapsulates the {DiHO} (and potentially any other) rule-mining algorithm in a distributed system, designed as an extensible digital toolset for data analysts in law enforcement, counterterrorism, health care and other application domains.},
  annote = {{{\textless}p{\textgreater}pottengerWilliam2006.pdf{\textless}/p{\textgreater}}}
}
-
N. Aussenac-Gilles and D. Sörgel, "Text analysis for ontology and terminology engineering," Applied Ontology, vol. 1, iss. 1, pp. 35-46, 2005.
@article{aussenac-gilles_text_2005,
  author = {Nathalie {Aussenac-Gilles} and Dagobert Sörgel},
  title = {Text analysis for ontology and terminology engineering},
  journal = {Applied Ontology},
  volume = {1},
  number = {1},
  pages = {35--46},
  year = {2005},
  issn = {1570-5838, 1875-8533},
  url = {http://iospress.metapress.com/content/e046qf1uwpdaedj7/},
  keywords = {Analyse de texte, Ontologie, Terminologie},
  abstract = {After a recent breakthrough in the early 90's, text analysis is acknowledged as one of the promising ways to rapidly build better grounded semantic resources such as terminologies and ontologies. This domain has recently undergone significant evolutions with a massive reference to machine learning algorithms and information extraction techniques together with linguistic- and statistic-based natural language processing. This position paper promotes three main ideas: (i) that highly domain-specific or task-specific, even idiosyncratic ontologies, are very useful, especially when they are linked to broader consensual schemes and they can be built with reasonable effort; (ii) that corpus-based ontologies can capture the perspective of a domain; and (iii) that supervised ontology learning from text makes feasible the development of specialized ontologies adapted for specific uses. We propose the establishment of an inventory of tools for building ontologies from text, give a first classification of such tools, and present an initial review of some recent methods and tools.},
  annote = {{{\textless}p{\textgreater}aussenac-gillesNathalie.pdf{\textless}/p{\textgreater}}}
}
-
N. Aussenac-Gilles, "Supervised text analysis for ontology and terminology engineering," in Machine Learning for the Semantic Web, 2005, pp. 13-18.
@inproceedings{aussenac-gilles_supervised_2005,
  author = {Nathalie {Aussenac-Gilles}},
  title = {Supervised text analysis for ontology and terminology engineering},
  booktitle = {Machine Learning for the Semantic Web},
  year = {2005},
  month = feb,
  pages = {13--18},
  keywords = {Analyse de texte, Ontologie},
  abstract = {One of the means to reach a Semantic Web is to add some machine readable meta-data to documents (semantic annotations) and/or to improve the performance of information retrieval applications thanks to the use of semantic resources such as terminologies and ontologies. This position paper promotes the idea that these semantic resources cannot be universal but should rather be domain and even task specific in most cases. Moreover, we assert that their content is all the more relevant that it has been defined from document content analysis. And finally, we advocate in favor of a supervised learning process for their design from well selected texts. We shortly present two contributions, a method to build ontologies from texts and a tool for semantic relation identification, that illustrate these positions.},
  annote = {{{\textless}p{\textgreater}aussenac-gillesNathalie.pdf{\textless}/p{\textgreater}}}
}
-
B. Grilheres, S. Canu, C. Beauce, and S. Brunessaux, "A platform for semantic annotations and ontology population using conditional random fields," in Proceedings. The 2005 IEEE/WIC/ACM International Conference on Web Intelligence, Compiegne, France, 2005, pp. 790-793.
@inproceedings{grilheres_platform_2005,
  author = {B. Grilheres and S. Canu and C. Beauce and S. Brunessaux},
  title = {A platform for semantic annotations and ontology population using conditional random fields},
  booktitle = {Proceedings. The 2005 {IEEE/WIC/ACM} International Conference on Web Intelligence},
  address = {Compiegne, France},
  publisher = {{IEEE} Comput. Soc},
  year = {2005},
  pages = {790--793},
  note = {Copyright 2006, {IEE}},
  keywords = {Analyse de texte, Fouille de donnée, Ontologie},
  abstract = {Ontologies are widely used for organising and sharing knowledge. But elaborating these resources is a heavy and time-consuming task. This paper is two-fold: it describes {EADS} {DCS} text-mining platform, in particular, its service to annotate documents with semantic tags and it presents its extension for incremental learning of ontologies. Domain experts are assisted in the ontology population task by recent machine learning techniques (i.e. conditional random fields). Comparisons are made between annotations from the ontology and from a trained {CRF} model, so as to detect candidate instances. An iterative process controlled by the experts results in knowledge discovery and constitution of an accurate ontology},
  annote = {{\textless}p{\textgreater}8747803 semantic annotation ontology population conditional random field knowledge organisation knowledge sharing {EADS} {DCS} text-mining document annotation semantic tags incremental learning machine learning knowledge discovery{\textless}/p{\textgreater}}
}
-
J. An and Y. P. P. Chen, "Keyword extraction for text categorization," Kagawa, Japan, 2005, pp. 556-561.
@inproceedings{an_keyword_2005,
  author = {J. An and Y. P. P. Chen},
  title = {Keyword extraction for text categorization},
  address = {Kagawa, Japan},
  publisher = {{IEEE}},
  year = {2005},
  pages = {556--561},
  note = {Copyright 2005, {IEE}},
  keywords = {Analyse de texte, Classification},
  abstract = {Text categorization {(TC)} is one of the main applications of machine learning. Many methods have been proposed, such as Rocchio method, Naive bayes based method, and {SVM} based text classification method. These methods learn labeled text documents and then construct a classifier. A new coming text document's category can be predicted. However, these methods do not give the description of each category. In the machine learning field, there are many concept learning algorithms, such as, {ID3} and {CN2.} This paper proposes a more robust algorithm to induce concepts from training examples, which is based on enumeration of all possible keywords combinations. Experimental results show that the rules produced by our approach have more precision and simplicity than that of other methods},
  annote = {{\textless}p{\textgreater}8613955 keywords extraction text document categorization machine learning Naive bayes method {SVM} text classification {ID3} concept learning algorithm {CN2} learning algorithm Rocchio method{\textless}/p{\textgreater}}
}
-
A. Zanasi, "Text mining tools." Southampton: WIT Press, 2005, pp. 315-327.
@incollection{zanasi_text_2005,
  author = {A. Zanasi},
  title = {Text mining tools},
  booktitle = {Text mining and its applications to intelligence, {CRM} and knowledge management},
  publisher = {{WIT} Press},
  address = {Southampton},
  year = {2005},
  pages = {315--327},
  keywords = {Analyse de texte, Fouille de texte},
  abstract = {The following companies, their company information and their products in text mining are discussed in this paper: Megaputer intelligence {(Poly} Analyst for Text and {TextAnalyst);} {SAS} {(SAS} Text Miner); {SPSS} {(Text} Mining for Clementine and {LexiQuest} text-mining add-on); Synthema {(TEMIS} Online Miner Light, {4006-LXS);} {TEMIS} {(Insight} discoverer clusterer, Insight discoverer categorizer, Insight discoverer extractor and skills cartridges, Online miner, and Xelda); Autonomy; Clearforest; Convera; Entrieva; Fast Search \& Transfer; {IBM;} Insightful; Inxight; and Verity}
}
-
A. I. Adegorite, O. A. Basir, M. S. Kamel, and K. B. Shaban, "An approach to mining picture objects based on textual cues," in Machine learning and data mining in pattern recognition : 4th international conference, MLDM 2005, Leipzig, Germany, july 9-11, 2005 : proceedings, Berlin; Heidelberg, 2005, pp. 466-475.
@inproceedings{adegorite_approach_2005,
  author = {A. I. Adegorite and O. A. Basir and M. S. Kamel and K. B. Shaban},
  title = {An approach to mining picture objects based on textual cues},
  booktitle = {Machine learning and data mining in pattern recognition : 4th international conference, {MLDM} 2005, Leipzig, Germany, july 9-11, 2005 : proceedings},
  series = {Lecture notes in computer science},
  volume = {3587},
  publisher = {Springer},
  address = {Berlin; Heidelberg},
  year = {2005},
  pages = {466--475},
  keywords = {Analyse de texte, Fouille de donnée},
  abstract = {The task of extracting knowledge from text is an important research problem for information processing and document understanding. Approaches to capture the semantics of picture objects in documents constitute subjects of great interest in the domain of document mining recently. In this paper, we present an approach to extracting information about picture objects in a document using cues from the text written about them. The goal of this work is to mine a document and understand the content of picture objects in the document based on meaning inferred from the texts written about such objects. We apply some natural language processing techniques to extract semantic information about picture objects in a document and process texts written about them. The mining algorithms were developed and implemented as a working system and gone through testing and experimentations. Results and future extensions of the work are discussed in this paper.}
}
-
H. Calvo and A. Gelbukh, "Acquiring selectional preferences from untagged text for prepositional phrase attachment disambiguation," in Natural language processing and information systems : 9th international conference on applications of natural language to information systems, NLDB 2004 : proceedings, Salford, UK, 2004, pp. 207-16.
@inproceedings{calvo_acquiring_2004,
  author = {H. Calvo and A. Gelbukh},
  title = {Acquiring selectional preferences from untagged text for prepositional phrase attachment disambiguation},
  booktitle = {Natural language processing and information systems : 9th international conference on applications of natural language to information systems, {NLDB} 2004 : proceedings},
  series = {Lecture notes in computer science},
  volume = {3136},
  publisher = {{Springer-Verlag}},
  address = {Salford, {UK}},
  year = {2004},
  pages = {207--216},
  note = {Copyright 2005, {IEE}},
  keywords = {Analyse de texte, Classification, Recherche d'information},
  abstract = {Extracting information automatically from texts for database representation requires previously well-grouped phrases so that entities can be separated adequately. This problem is known as prepositional phrase {(PP)} attachment disambiguation. Current {PP} attachment disambiguation systems require an annotated treebank or they use an Internet connection to achieve a precision of more than 90\%. Unfortunately, these resources are not always available. In addition, using the same techniques that use the Web as corpus may not achieve the same results when using local corpora. In this paper, we present an unsupervised method for generalizing local corpora information by means of semantic classification of nouns based on the top 25 unique beginner concepts of {WordNet.} Then, we propose a method for using this information for {PP} attachment disambiguation},
  annote = {{\textless}p{\textgreater}8279739 selectional preference acquisition untagged text prepositional phrase attachment disambiguation automatic information extraction database representation annotated treebank Internet connection World Wide Web unsupervised method local corpora information semantic classification {WordNet} well-grouped phrases{\textless}/p{\textgreater}}
}
-
C. C. Aggarwal, S. C. Gates, and P. S. Yu, "On using partial supervision for text categorization," IEEE Transactions on Knowledge and Data Engineering, vol. 16, iss. 2, pp. 245-255, 2004.
@article{aggarwal_using_2004,
  author = {Charu C. Aggarwal and Stephen C. Gates and Philip S. Yu},
  title = {On using partial supervision for text categorization},
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume = {16},
  number = {2},
  pages = {245--255},
  year = {2004},
  issn = {1041-4347},
  doi = {10.1109/TKDE.2004.1269601},
  url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1269601&isnumber=28401},
  keywords = {Analyse de texte, Catégorisation, Cluster},
  abstract = {In this paper, we discuss the merits of building text categorization systems by using supervised clustering techniques. Traditional approaches for document classification on a predefined set of classes are often unable to provide sufficient accuracy because of the difficulty of fitting a manually categorized collection of documents in a given classification model. This is especially the case for heterogeneous collections of Web documents which have varying styles, vocabulary, and authorship. Hence, this paper investigates the use of clustering in order to create the set of categories and its use for classification of documents. Completely unsupervised clustering has the disadvantage that it has difficulty in isolating sufficiently fine-grained classes of documents relating to a coherent subject matter. In this paper, we use the information from a preexisting taxonomy in order to supervise the creation of a set of related clusters, though with some freedom in defining and creating the classes. We show that the advantage of using partially supervised clustering is that it is possible to have some control over the range of subjects that one would like the categorization system to address, but with a precise mathematical definition of how each category is defined. An extremely effective way then to categorize documents is to use this a priori knowledge of the definition of each category. We also discuss a new technique to help the classifier distinguish better among closely related clusters.},
  annote = {{{\textless}p{\textgreater}aggarwalCharu2004.pdf{\textless}/p{\textgreater}}}
}
-
F. Houben, "Mot vide, mot plein : comment trancher localement," in Rencontre des étudiants chercheurs en informatique pour le traitement des langues (RECITAL), Fès, Maroc, 2004.
@inproceedings{houben_mot_2004,
  author = {Frédérick Houben},
  title = {Mot vide, mot plein : comment trancher localement},
  booktitle = {Rencontre des étudiants chercheurs en informatique pour le traitement des langues {(RECITAL)}},
  address = {Fès, Maroc},
  year = {2004},
  month = apr,
  url = {http://aune.lpl.univ-aix.fr/jep-taln04/proceed/actes/recital2004/Houben.rec04.pdf},
  keywords = {Analyse de texte, Fouille de texte},
  abstract = {Nous présentons une méthode multilingue de catégorisation en mot vide / mot plein à partir de corpus brut. Cette méthode fait appel à des propriétés très générales des langues ainsi qu’à des techniques issues de la communauté de la fouille de données.},
  annote = {{{\textless}p{\textgreater}houbenFrederick2004.pdf{\textless}/p{\textgreater}}}
}
-
J. Diesner and K. M. Carley, "Using network text analysis to detect the organizational structure of covert networks," in Proceedings of the north american association for computational social and organizational science (NAACSOS) 2004 conference, Pittsburgh, Pennsylvania, United States, 2004.
@inproceedings{diesner_using_2004,
  author = {Jana Diesner and Kathleen M. Carley},
  title = {Using network text analysis to detect the organizational structure of covert networks},
  booktitle = {Proceedings of the north american association for computational social and organizational science {(NAACSOS)} 2004 conference},
  address = {Pittsburgh, Pennsylvania, United States},
  year = {2004},
  url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.61.8346&rep=rep1&type=pdf},
  keywords = {Analyse de texte, Fouille de texte},
  abstract = {Herein we demonstrate how to get from sets of texts to network representations of covert networks represented in texts. We report on the application of a computer-supported technique that combines network analysis of texts with classifying social and organizational systems into an ontology called the meta-matrix model. The resulting combinatory method is referred to as {Meta-Matrix} Text Analysis. We apply this technique to detect the social and organizational structure of a Mideastern country. Social agents covered in our coding are people and organizations identified by human subject matter experts to be relevant to intelligence matters in that area.},
  annote = {{{\textless}p{\textgreater}diesnerJana2004.pdf{\textless}/p{\textgreater}}}
}
-
T. Virtanen, Approaches to cognition through text and discourse, Berlin ; New York: M. de Gruyter, 2004.
@book{virtanen_approaches_2004,
  author = {Tuija Virtanen},
  title = {Approaches to cognition through text and discourse},
  series = {Trends in linguistics. Studies and monographs},
  volume = {147},
  publisher = {M. de Gruyter},
  address = {Berlin ; New York},
  year = {2004},
  isbn = {3110177919},
  keywords = {Analyse de discours, Analyse de texte, Science cognitive},
  annote = {{{\textless}p{\textgreater}TOC} à la Library of Congress (http://www.loc.gov/catdir/toc/ecip0421/2004018522.html ){\textless}/p{\textgreater}}
}
-
A. Farzindar, G. Lapalme, and J.-P. Desclés, "Résumé de textes juridiques par identification de leur structure thématique," Traitement automatique des langues, vol. 45, iss. 1, pp. 39-64, 2004.
@article{farzindar_rsum_2004,
  author = {Farzindar, Atefeh and Lapalme, Guy and Desclés, Jean-Pierre},
  title = {Résumé de textes juridiques par identification de leur structure thématique},
  journal = {Traitement automatique des langues},
  volume = {45},
  number = {1},
  pages = {39--64},
  year = {2004},
  issn = {1248-9433},
  url = {http://rali.iro.umontreal.ca/Publications/urls/FarzindarTAL04.pdf},
  keywords = {Analyse de texte, Linguistique},
  abstract = {Cet article présente notre méthode de production automatique de résumé de textes juridiques qui permet aux juristes de consulter rapidement les idées-clés d'une décision juridique pour trouver les jurisprudences pertinentes à leurs besoins. La méthodologie repose sur l'exploitation de la structure thématique afin de constituer automatiquement une fiche de résumé augmentant la cohérence et la lisibilité du résumé. La constitution de la fiche de résumé se fait en quatre étapes : la segmentation thématique qui repère la structure du document en quatre thèmes {INTRODUCTION,} {CONTEXTE,} {RAISONNEMENT} {JURIDIQUE} et {CONCLUSION,} le filtrage des unités moins importantes comme les citations des articles des lois, la sélection des unités textuelles saillantes dans les segments thématiques et la production du résumé dans la limite de la taille demandée. La conception des différentes composantes du système, appelé {LetSum,} est décrite en détail ainsi que son implémentation et le résultat d'évaluations.},
  annote = {{{\textless}p{\textgreater}farzindarAtefeh2004.pdf{\textless}/p{\textgreater}}}
}
-
D. Mertz, Text processing in Python, Boston: Addison-Wesley, 2003.
@book{mertz_text_2003,
  author = {David Mertz},
  title = {Text processing in Python},
  publisher = {{Addison-Wesley}},
  address = {Boston},
  year = {2003},
  isbn = {0321112547},
  keywords = {Analyse de texte}
}
-
É. Brunet, "Peut-on mesurer la distance entre deux textes?," Corpus, iss. 2, 2003.
@article{tienne_peut-mesurer_2003,
  author = {Brunet, Étienne},
  title = {Peut-on mesurer la distance entre deux textes?},
  shorttitle = {Peut-on mesurer la distance entre deux textes?},
  journal = {Corpus},
  series = {La distance intellectuelle},
  number = {2},
  year = {2003},
  month = dec,
  url = {http://corpus.revues.org/document30.html},
  keywords = {Analyse de texte, Fouille de texte},
  abstract = {Le présent exposé tente d’explorer et de comparer les méthodes qu’on a proposées jusqu’ici pour mesurer la distance entre deux textes. Les formules sont diverses, et s’appliquent tantôt à la fréquence, tantôt à la présence / absence. Et l’objet mesuré varie grandement (graphies, n‑grammes, lemmes, classes de fréquence, codes grammaticaux, structures syntaxiques ou sémantiques). L’expérience montre pourtant que la convergence est au rendez-vous.},
  annote = {{{\textless}p{\textgreater}brunetEtienne2003.pdf{\textless}/p{\textgreater}}}
}
-
D. J. Lawrie, "Language models for hierarchical summarization," PhD Thesis , 2003.
@phdthesis{lawrie_language_2003,
  author = {Dawn J. Lawrie},
  title = {Language models for hierarchical summarization},
  school = {University of Massachusetts Amherst},
  type = {{Ph.D.} en philosophie},
  year = {2003},
  month = sep,
  pages = {218 p.},
  keywords = {Analyse de texte, Langage naturel},
  abstract = {Hierarchies have long been used for organization, summarization, and access to information. In this dissertation we define summarization in terms of a probabilistic language model and use this definition to explore a new technique for automatically generating topic hierarchies. We use the language model to characterize the documents that will be summarized and then apply a graph-theoretic algorithm to determine the best topic words for the hierarchical summary. This work is very different from previous attempts to generate topic hierarchies because it relies on statistical analysis and language modeling to identify descriptive words for a document and organize the words in a hierarchical structure. We compare our new technique to previous methods proposed for constructing topic hierarchies, including subsumption and lexical hierarchies. We also compare the words chosen to be part of the hierarchy to the top ranked words using {TF.IDF} in terms of how well each summarizes the document set. Our results show that the language modeling approach performs as well as or better than these other techniques in non user-based evaluations. We also show that the hierarchies provide better access to the documents described in the summary than does a ranked list using one of the non-user based evaluations we have developed. In a user study that compares the ability of users to find relevant instances using both the hierarchy and a ranked list to using the ranked list alone, we find that users like the information provided by the hierarchy and after some practice can use it as effectively as they can a ranked list.},
  annote = {{{\textless}p{\textgreater}lawrieDawn2003.pdf{\textless}/p{\textgreater}}}
}
-
J.-F. de Pasquale and J.-G. Meunier, "Categorisation techniques in computer assisted-reading and analysis of texts (CARAT) in the humanities," Computers and the Humanities, vol. 37, iss. 1, pp. 111-118, 2003.
@article{de_pasquale_categorisation_2003,
  author = {de Pasquale, Jean-Frédéric and Meunier, Jean-Guy},
  title = {Categorisation techniques in computer assisted-reading and analysis of texts {(CARAT)} in the humanities},
  journal = {Computers and the Humanities},
  volume = {37},
  number = {1},
  pages = {111--118},
  year = {2003},
  issn = {0010-4817, 1572-8412},
  doi = {10.1023/A:1021855607270},
  url = {http://www.springerlink.com/content/nj2525027814q0x7/fulltext.pdf},
  keywords = {Analyse de texte, Catégorisation, Classification},
  abstract = {There are two important strategies in computer-assisted reading and analysis of text {(CARAT).} The first relates to the classification process, and the second pertains to the categorisation process. These two often-interrelated operations have been regularly recognised as essential components of text analysis. However, the two operations are highly time-consuming. A possible solution to this problem calls upon more inductive or bottom-up strategies that are numerical and statistical in nature. In our own research, we have been exploring a few of these techniques and their combination. We now know, through our own past research and others' work, that the classification methods allow a good empirical thematic exploration of a corpus. More specifically, in this paper we shall concentrate on the problem of assisting the automatic categorisation of small segments of a philosophical text into a set of thematic categories.},
  annote = {{{\textless}p{\textgreater}de\_pasqualeJean-frederic2003{\textless}/p{\textgreater}}}
}
-
L. O. H. Stanley, "A tourism recommender system based on collaboration and text analysis," Information technology \& tourism, vol. 6, pp. 157-165, 2003.
@article{stanley_tourism_2003,
  author = {Loh, Stanley and others},
  title = {A tourism recommender system based on collaboration and text analysis},
  journal = {Information Technology \& Tourism},
  volume = {6},
  number = {3},
  pages = {157--165},
  year = {2003},
  url = {http://www.ingentaconnect.com/content/cog/itt/2003/00000006/00000003/art00002},
  abstract = {This work presents a recommender system that helps travel agents in discovering options for customers, especially those who do not know where to go and what to do. The system analyzes textual messages exchanged between a travel agent and a customer through a private Web chat. Text mining techniques help discover interesting areas in the messages. After that, the system searches a database and retrieves tourist options (like cities and attractions) classified in these interesting areas. The system makes use of a tourism ontology, containing themes and a controlled vocabulary, to identify themes in the textual messages. The system acts as a decision support system, because it does not make recommendations directly to the customer.},
  keywords = {Analyse de texte, Fouille de texte},
},
-
M. A. K. Halliday and J. J. Webster, Linguistic studies of text and discourse, London: Continuum, 2002.
@book{halliday_linguistic_2002,
  author = {M. A. K. Halliday and Jonathan James Webster},
  title = {Linguistic studies of text and discourse},
  publisher = {Continuum},
  address = {London},
  year = {2002},
  isbn = {0826458688},
  keywords = {Analyse de discours, Analyse de texte, Linguistique},
},
-
W. Kintsch, "On the notions of theme and topic in psychological process models of text comprehension," in Thematics: interdisciplinary studies, M. Louwerse and W. van Peer, Eds., Amsterdam: John Benjamins, 2002, pp. 157-170.
@incollection{kintsch_notions_2002,
  author = {Kintsch, Walter},
  editor = {Louwerse, Max and van Peer, Willie},
  title = {On the notions of theme and topic in psychological process models of text comprehension},
  booktitle = {Thematics: interdisciplinary studies},
  publisher = {John Benjamins},
  address = {Amsterdam},
  year = {2002},
  pages = {157--170},
  keywords = {Analyse de texte, Science cognitive},
},
-
D. Forest, "Lecture et analyse de textes philosophiques assistées par ordinateur : application d’une approche classificatoire mathématique à l’analyse thématique du « Discours de la méthode » et des « Méditations métaphysiques » de Descartes," Master's thesis, Université du Québec à Montréal, 2002.
@mastersthesis{forest_lecture_2002,
  author = {Forest, Dominic},
  title = {Lecture et analyse de textes philosophiques assistées par ordinateur : application d'une approche classificatoire mathématique à l'analyse thématique du « Discours de la méthode » et des « Méditations métaphysiques » de Descartes},
  school = {Université du Québec à Montréal, Philosophie},
  year = {2002},
  note = {Maîtrise en philosophie},
  keywords = {Analyse de texte, Fouille de texte, Philosophie},
  pages = {131},
},
-
J.-M. Torres-Moreno, P. Velásquez-Morales, and J.-G. Meunier, "Condensés de textes par des méthodes numériques," in 6es journées internationales d’analyse statistique des données textuelles, 2002.
@inproceedings{torres-moreno_condenss_2002,
  author = {Torres-Moreno, Juan-Manuel and Velásquez-Morales, Patricia and Meunier, Jean-Guy},
  title = {Condensés de textes par des méthodes numériques},
  booktitle = {6es journées internationales d'analyse statistique des données textuelles},
  year = {2002},
  abstract = {Since information in electronic form is already a standard, and that the variety and the quantity of information become increasingly large, the methods of summarizing or automatic condensation of texts is a critical phase of the analysis of texts. This article describes Cortex a system based on numerical methods, which allows obtaining a condensation of a text, which is independent of the topic and of the length of the text. The structure of the system enables it to find the abstracts in French or Spanish in very short times. Étant donné que la variété et la quantité de l’information sous forme électronique deviennent de plus en plus grandes, des méthodes d’obtention de résumés ou de condensation automatique de textes constituent une phase critique de l’analyse de textes. Cet article décrit Cortex, un système basé sur des méthodes numériques qui permet l’obtention d’un condensé d’un texte, qui est indépendant du thème, de l’ampleur du texte et de la façon dont il est écrit. La structure du système lui permet de trouver la condensation de textes multilangues dans des temps très courts. Des applications en français ou espagnol sont présentées et analysées.},
  keywords = {Analyse de texte, Approche statistique, Catégorisation},
},
-
C. Barriere, "Hierarchical refinement and representation of the causal relation," Terminology, vol. 8, iss. 1, pp. 91-111, 2002.
@article{barriere_hierarchical_2002,
  author = {C. Barriere},
  title = {Hierarchical refinement and representation of the causal relation},
  journal = {Terminology},
  volume = {8},
  number = {1},
  pages = {91--111},
  year = {2002},
  abstract = {This research looks at the complexity inherent in the causal relation and the implications for its representation in a terminological knowledge base {(TKB).} Supported by a more general study of semantic relation hierarchies, a hierarchical refinement of the causal relation is proposed. It results from a manual search of a corpus which shows that it efficiently captures and formalizes variations expressed in text. The feasibility of determining such categorization during automatic extraction from corpora is also explored. Conceptual graphs are used as a representation formalism to which we have added certainty information to capture the degree of certainty surrounding the interaction between two terms involved in a causal relation},
  keywords = {Analyse de texte, Graphique},
  annote = {{{\textless}p{\textgreater}Copyright} 2002, {IEE} 7389562 0929-9971 hierarchical refinement causal relation representation terminological knowledge base semantic networks semantic relation hierarchies corpus terminology text analysis categorization conceptual graphs certainty information{\textless}/p{\textgreater}},
},
-
N. Fourour and E. Morin, "Apport du Web dans la reconnaissance des entités nommées," in Taln, Web et corpus. Colloque, Saint-Denis, France, Sillery, Québec, Canada, 2002.
@inproceedings{fourour_apport_2002,
  author = {Fourour, Nordine and Morin, Emmanuel},
  title = {Apport du Web dans la reconnaissance des entités nommées},
  booktitle = {Taln, Web et corpus. Colloque, {Saint-Denis}, France},
  volume = {32},
  publisher = {Presses de l'Université du Québec},
  address = {Sillery, Québec, Canada},
  year = {2002},
  abstract = {Cet article présente une étude sur l'apport que peut fournir le Web dans la reconnaissance des entités nommées pour le français. Cette étude a engendré l'implémentation d'un nouveau module de notre système de reconnaissance des entités nommées {(Némésis).} Ce module nous a permis d'évaluer l'apport de l'utilisation du Web dans cette tâche et de dégager un certain nombre d'heuristiques pour ce module. Les performances atteintes par Némésis, sur l'ensemble des entités nommées, étaient de 79 \% pour le rappel et 91 \% pour la précision. Le gain en rappel s'élève à plus de 5 \%, tandis que la perte en précision reste faible (environ 2 \%).},
  keywords = {Analyse de texte, Web},
  annote = {{{\textless}p{\textgreater}fourourNordine2002.pdf{\textless}/p{\textgreater}}},
},
-
T. Nasukawa and T. Nagano, "Text analysis and knowledge mining system," IBM systems journal, vol. 40, iss. 4, pp. 967-984, 2001.
@article{nasukawa_text_2001,
  author = {T. Nasukawa and T. Nagano},
  title = {Text analysis and knowledge mining system},
  journal = {{IBM} systems journal},
  volume = {40},
  number = {4},
  pages = {967--984},
  year = {2001},
  issn = {0018-8670},
  doi = {10.1147/sj.404.0967},
  url = {http://www.research.ibm.com/journal/sj/404/nasukawa.pdf},
  abstract = {Large text databases potentially contain a great wealth of knowledge. However, text represents factual information (and information about the author's communicative intentions) in a complex, rich, and opaque manner. Consequently, unlike numerical and fixed field data, it cannot be analyzed by standard statistical data mining methods. Relying on human analysis results in either huge workloads or the analysis of only a tiny fraction of the database. We are working on text mining technology to extract knowledge from very large amounts of textual data. Unlike information retrieval technology that allows a user to select documents that meet the user's requirements and interests, or document clustering technology that organizes documents, we focus on finding valuable patterns and rules in text that indicate trends and significant features about specific topics. By applying our prototype system named {TAKMI} {(Text} Analysis and Knowledge Mining) to textual databases in {PC} help centers, we can automatically detect product failures; determine issues that have led to rapid increases in the number of calls and their underlying reasons; and analyze help center productivity and changes in customers' behavior involving a particular product, without reading any of the text. We have verified that our framework is also effective for other data such as patent documents.},
  keywords = {Analyse de texte, Fouille de texte},
  annote = {{{\textless}p{\textgreater}nasukawaT2001.pdf{\textless}/p{\textgreater}}},
},
-
W. Paik, S. Yilmazel, E. Brown, M. Poulin, S. Dubon, and C. Amice, "Applying natural language processing (NLP) based metadata extraction to automatically acquire user preferences," in Proceedings of the First International Conference on Knowledge Capture, Victoria, BC, Canada, 2001, pp. 116-122.
@inproceedings{woojin_applying_2001,
  author = {Paik, Woojin and Yilmazel, S. and Brown, E. and Poulin, M. and Dubon, S. and Amice, C.},
  title = {Applying natural language processing ({NLP}) based metadata extraction to automatically acquire user preferences},
  booktitle = {Proceedings of the First International Conference on Knowledge Capture},
  pages = {116--122},
  publisher = {{ACM}},
  address = {Victoria, BC, Canada},
  year = {2001},
  abstract = {This paper describes a metadata extraction technique based on natural language processing {(NLP)} which extracts personalized information from email communications between financial analysts and their clients. Personalized means connecting users with content in a personally meaningful way to create, grow, and retain online relationships. Personalization often results in the creation of user profiles that store individuals' preferences regarding goods or services offered by various e-commerce merchants. With the introduction of e-commerce, it has become more difficult to develop and maintain personalized information due to larger transaction volumes. {{\textless}!metaMarker{\textgreater}} is an {NLP} and machine learning {(ML)-based} automatic metadata extraction system designed to process textual data such as emails, discussion group postings, or chat group transcriptions. {{\textless}!metaMarker{\textgreater}} extracts both explicit and implicit metadata elements including proper names, numeric concepts, and topic/subject information. In addition, Speech Act Theory inspired metadata elements, which represent the message creators' intention, mood, and urgency are also extracted. In a typical dialogue between financial analysts and their clients, clients often discuss the items that they liked or have an interest. By extracting this information, {{\textless}!metaMarker{\textgreater}} constructs user profiles automatically. This system has been designed, implemented, and tested with real-world data. The overall accuracy and coverage of extracting explicit and implicit metadata is about 90\%. In summary, the paper shows that an {NLP-based} metadata extraction system enables automatic user profiling with high effectiveness},
  note = {Copyright 2003, {IEE}},
  keywords = {Analyse de texte, Langage naturel},
  annote = {{\textless}p{\textgreater}7540433 natural language processing based metadata extraction automatic user preference acquisition personalized information extraction email communications online relationships user profiles e-commerce merchants {{\textless}!metaMarker{\textgreater}} machine learning textual data processing discussion group postings chat group transcriptions explicit metadata extraction implicit metadata extraction proper names numeric concepts topic information subject information speech act theory message creator intention message creator mood message creator urgency financial analyst/client dialogue{\textless}/p{\textgreater}},
},
-
Y. Yang, "A study of thresholding strategies for text categorization," in Proceedings of the 24th annual international ACM SIGIR conference on Research and development in information retrieval, New Orleans, Louisiana, United States, 2001, pp. 137-145.
@inproceedings{yang_study_2001,
  author = {Yiming Yang},
  title = {A study of thresholding strategies for text categorization},
  booktitle = {Proceedings of the 24th annual international {ACM} {SIGIR} conference on Research and development in information retrieval},
  pages = {137--145},
  publisher = {{ACM}},
  address = {New Orleans, Louisiana, United States},
  year = {2001},
  isbn = {1-58113-331-6},
  doi = {10.1145/383952.383975},
  url = {http://portal.acm.org/citation.cfm?id=383975&dl=ACM&coll=GUIDE},
  abstract = {Thresholding strategies in automated text categorization are an underexplored area of research. This paper presents an examination of the effect of thresholding strategies on the performance of a classifier under various conditions. Using {k-Nearest} Neighbor {(kNN)} as the classifier and five evaluation benchmark collections as the testbets, three common thresholding methods were investigated, including rank-based thresholding {(RCut),} proportion-based assignments {(PCut)} and score-based local optimization {(SCut);} in addition, new variants of these methods are proposed to overcome significant problems in the existing approaches. Experimental results show that the choice of thresholding strategy can significantly influence the performance of {kNN,} and that the ``optimal'' strategy may vary by application. {SCut} is potentially better for fine-tuning but risks overfitting. {PCut} copes better with rare categories and exhibits a smoother trade-off in recall versus precision, but is not suitable for online decision making. {RCut} is most natural for online response but is too coarse-grained for global or local optimization. {RTCut,} a new method combining the strength of category ranking and scoring, outperforms both {PCut} and {RCut} significantly.},
  keywords = {Analyse de texte, Analyse documentaire, Catégorisation},
  annote = {{{\textless}p{\textgreater}yangYiming2001.pdf{\textless}/p{\textgreater}}},
},
-
R. Popping, Computer-assisted text analysis, London: Sage, 2000.
@book{popping_computer-assisted_2000,
  author = {Roel Popping},
  title = {Computer-assisted text analysis},
  series = {New Technologies for Social Research},
  publisher = {Sage},
  address = {London},
  year = {2000},
  abstract = {This book provides an up-to-date picture of the main methods for the quantitative analysis of text. Popping begins by overviewing the background and the conceptual foundations of the field, introducing the latest developments. He then concentrates on a comprehensive coverage of the traditional thematic approaches of text analysis, followed by the newer developments in semantic and network text analysis methodologies. Finally, the author examines the relationship between content analysis and other kinds of text analysis - from qualitative research, linguistic analysis and information retrieval. Computer-assisted Text Analysis focuses on the methodological and practical issues of coding and handling data, including sampling, reliability and validity issues, and includes a useful appendix of computer programs for text analysis. The methods described are applicable across a wide range of disciplines in the social sciences and humanities as well as, for example, practitioners from the fields of political science, journalism., communication, marketing and information systems.},
  keywords = {Analyse de texte, Intelligence artificielle},
},
-
D. Forest and J.-G. Meunier, "La classification mathématique des textes : un outil d’assistance à la lecture et à l’analyse des textes philosophiques," in 5es journées internationales d’analyse statistique des données textuelles, 2000.
@inproceedings{forest_la_2000,
  author = {Forest, Dominic and Meunier, Jean-Guy},
  title = {La classification mathématique des textes : un outil d'assistance à la lecture et à l'analyse des textes philosophiques},
  booktitle = {5es journées internationales d'analyse statistique des données textuelles},
  year = {2000},
  month = mar,
  keywords = {Analyse de texte, Classification, Fouille de texte, Philosophie},
},
-
M. Alexa and C. Zuell, A review of software for text analysis, Mannheim: Zuma, 1999.
@book{alexa_review_1999,
  author = {Melina Alexa and Cornelia Zuell},
  title = {A review of software for text analysis},
  publisher = {Zuma},
  address = {Mannheim},
  year = {1999},
  isbn = {3924220166},
  abstract = {The book reviews a selection of software for computer-assisted text analysis. The primary aim is to provide a detailed (and up-to-date) account of the spectrum of available text analysis software and catalogue the kinds of support the selected software offers to the user. A related, more general, goal is to record the tendencies both in functionality and technology and identify the areas where more development is needed. For this reason the presented selection of software comprises not only fully developed commercial and research programs, but also prototypes and beta versions. An additional aspect with regards to the kinds of software reviewed is that both qualitative and quantitative-oriented types of research are included. Depending on research purposes and project design the text analyst can profit from available tools independently of their orientation. Today it is often the case that in computational support, the borderline between quantitative and qualitative methodologies can become 'obscure'; instead, one can detect a number of commonalities which can be placed within a broader text analysis context. The following fifteen programs are reviewed: {AQUAD,} {ATLAS.ti,} {CoAN,} {Code-A-Text,} {DICTION,} {DIMAP-MCCA,} {HyperRESEARCH,} {KEDS,} {NUD*IST,} {QED,} {TATOE,} {TEXTPACK,} {TextSmart,} {WinMAXpro,} and {WordStat} and the criteria and methodology used for selecting them are delineated. The last part of the book contains an extensive discussion about text analysis programs and the concrete issues raised from the review.},
  keywords = {Analyse de texte},
  annote = {{{\textless}p{\textgreater}alexaMelina2001.pdf{\textless}/p{\textgreater}}},
},
-
J.-G. Meunier, L. Remaki, and D. Forest, "Use of classifiers in Computer-Assisted Reading and Analysis of Text (CARAT)," Las Vegas, 1999.
@inproceedings{meunier_use_1999,
  author = {Meunier, Jean-Guy and Remaki, L. and Forest, Dominic},
  title = {Use of classifiers in {Computer-Assisted} Reading and Analysis of Text ({CARAT})},
  address = {Las Vegas},
  year = {1999},
  month = jul,
  keywords = {Analyse de texte, Fouille de texte},
},
-
A. Ballim, V. Pallotta, and C. Lieske, "Robust text analysis : an overview," , p. 21, 1999.
@misc{ballim_robust_1999,
  author = {Ballim, Afzal and Pallotta, Vincenzo and Lieske, Christian},
  title = {Robust text analysis : an overview},
  year = {1999},
  month = may,
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.43.373},
  note = {21 p.},
  keywords = {Analyse de texte, Fouille de texte},
  annote = {{{\textless}p{\textgreater}ballimAfzal1999.pdf{\textless}/p{\textgreater}}},
},
-
M. Alexa and C. Zuell, "Commonalities, differences and limitations of text analysis software : the results of a review," Quality and Quantity, vol. 34, iss. 3, pp. 299-321, 1999.
@article{alexa_commonalities_1999,
  author = {Alexa, Melina and Zuell, Cornelia},
  title = {Commonalities, differences and limitations of text analysis software : the results of a review},
  journal = {Quality and Quantity},
  volume = {34},
  number = {3},
  pages = {299--321},
  year = {1999},
  month = jun,
  issn = {0033-5177 (Print); 1573-7845 (Online)},
  doi = {10.1023/A:1004740203542},
  url = {http://www.springerlink.com/content/m5767p535136087p/fulltext.pdf},
  abstract = {This paper discusses on the one hand the tendencies in functionality and technology of software for text analysis and reflects, on the other hand, on the areas where more development is needed. The basis for this discussion forms a comprehensive review {(Alexa} \& Zuell, in press) of fifteen currently available software for text analysis. In the review each software package, i.e. {AQUAD,} {ATLAS.ti,} {CoAN,} {Code-A-Text,} {DICTION,} {DIMAP-MCCA,} {HyperRESEARCH,} {KEDS,} {NUD*IST,} {QED,} {TATOE,} {TEXTPACK,} {TextSmart,} {WinMAXpro,} and {WordStat,} was presented in a detailed and extensive manner. In this paper we shall only delineate our methodology and criteria for selecting which programs to review and concentrate on discussing the types of support the selected programs offer, the commonalities and differences of their functionality, point to some of their shortcomings and put forward suggestions for future development.},
  keywords = {Analyse de texte},
  annote = {{{\textless}p{\textgreater}alexaMelina1999.pdf{\textless}/p{\textgreater}}},
},
-
M. Alexa, Computer assisted text analysis methodology in the social sciences, 1997.
@techreport{alexa_computer_1997,
  author = {Alexa, Melina},
  title = {Computer assisted text analysis methodology in the social sciences},
  type = {{ZUMA-Arbeitsbericht}},
  number = {97/07},
  institution = {{ZUMA}},
  year = {1997},
  month = oct,
  url = {http://www.gesis.org/publikationen/Berichte/ZUMA_Arbeitsberichte/97/97_07.pdf},
  abstract = {The report presents an account of methods of research in computer-assisted text analysis in the social sciences. Rather than to provide a comprehensive enumeration of all computer-assisted text analysis investigations either directly or indirectly related to the social sciences using a quantitative and computer-assisted methodology as their text analytical tool, the aim of the report is to describe the current methodological standpoint of computer-assisted text analysis in the social sciences. The report provides, thus, a description and a discussion of the operations carried out in computer-assisted text analysis investigations. The report examines both past and well-established as well as some of the current approaches in the field and describes the techniques and the procedures involved. By this means, a first attempt is made toward cataloguing the kinds of supplementary information as well as computational support which are further required to expand the suitability and applicability of the method for the variety of text analysis goals.},
  keywords = {Analyse de texte},
  annote = {{{\textless}p{\textgreater}alexaMelina1997.pdf{\textless}/p{\textgreater}}},
},
-
J. M. Salanskis, F. Rastier, R. Scheps, and Centre culturel international de Cerisy-la-Salle, Herméneutique : textes, sciences, Paris: Presses universitaires de France, 1997.
@book{salanskis_hermneutique_1997,
  author = {Salanskis, J. M. and Rastier, François and Scheps, R. and {Centre culturel international de Cerisy-la-Salle}},
  title = {Herméneutique : textes, sciences},
  series = {Philosophie d'aujourd'hui},
  publisher = {Presses universitaires de France},
  address = {Paris},
  year = {1997},
  isbn = {2130486762},
  keywords = {Analyse de texte, Linguistique},
},
-
J.-G. Meunier, "La lecture et l’analyse de texte assistées par ordinateur (LATAO) comme système de traitement d’information," Sciences cognitives, vol. 22, pp. 211-223, 1997.
@article{meunier_la_1997,
  author = {Meunier, Jean-Guy},
  title = {La lecture et l'analyse de texte assistées par ordinateur ({LATAO}) comme système de traitement d'information},
  journal = {Sciences cognitives},
  volume = {22},
  pages = {211--223},
  year = {1997},
  abstract = {La venue récente des technologies du cédérom et encore plus de l'inforoute a mis en évidence les difficultés du traitement de l'information textuelle électronique. Il ne suffit plus simplement de stocker des textes ou de les retrouver, encore faut-il souvent en découvrir le contenu. malgré la richesse et la pertinence du développement technologique, peu de réflexions ont été faites sur le fondement théorique de cette technologie de la lecture et de l'analyse de texte assistées par ordinateur. {L'A.} examine une hypothèse selon laquelle l'accès informatique au contenu d'un texte pourrait être conçu comme un système de traitement d'information. L'objectif visé par une telle analyse est de comprendre davantage les paramètres qui constituent la technologie informatique de la lecture et de l'analyse de texte assistée par ordinateur.},
  keywords = {Analyse de texte, Langage naturel},
},
-
H. Ahonen, O. Heinonen, M. Klemettinen, and I. A. Verkamo, "Applying data mining techniques in text analysis," , 1997.
@techreport{ahonen_applying_1997,
  author = {Ahonen, Helena and Heinonen, Oskari and Klemettinen, Mika and Verkamo, A. Inkeri},
  title = {Applying data mining techniques in text analysis},
  institution = {University of Helsinki, Department of Computer Science},
  number = {C-1997-23},
  year = {1997},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.52.7756},
  abstract = {A number of recent data mining techniques have been targeted especially for the analysis of sequential data. Traditional examples of sequential data involve telecommunication alarms, Www log files, user action registration for Hci studies, or any other series of events consisting of an event type and a time of occurrence. Text can also be seen as sequential data, in many respects similar to the data collected by sensors, or other observation systems. Traditionally, texts have been analysed using various information retrieval related methods, such as full-text analysis, and natural language processing. However, only few examples of data mining in text, particularly in full text, are available. In this paper we show that general data mining methods are applicable to text analysis tasks under certain conditions. Moreover, we present a general framework for text mining. The framework follows the general Kdd process, thus containing steps from preprocessing to the utilization of the results. The data mining method that we apply is based on generalized episodes and episode rules. We consider preprocessing of the text to be essential in text mining: by shifting the focus in the preprocessing phase, data mining can be used to obtain results for various purposes. We give concrete examples of how to preprocess texts based on the intended use of the discovered results and how to balance preprocessing with postprocessing. We also present example applications including search for key words, key phrases and other co-occurring words, e.g. collocations and generalized concordances. These applications are both common and relevant tasks in information retrieval and natural language processing. We also present results from real-life data experiments to show that our approach is applicable in practice.},
  keywords = {Analyse de texte, Fouille de donnée, Fouille de texte},
  annote = {{{\textless}p{\textgreater}ahonenHelena1997.pdf{\textless}/p{\textgreater}}},
},
-
G. Salton, A. Singhal, C. Buckley, and M. Mitra, "Automatic text decomposition using text segments and text themes," in Proceedings of the seventh ACM conference on Hypertext, Bethesda, Maryland, United States, 1996, pp. 53-65.
@inproceedings{salton_automatic_1996,
  author = {Salton, Gerard and Singhal, Amit and Buckley, Chris and Mitra, Mandar},
  title = {Automatic text decomposition using text segments and text themes},
  booktitle = {Proceedings of the seventh {ACM} conference on Hypertext},
  pages = {53--65},
  publisher = {{ACM}},
  address = {Bethesda, Maryland, United States},
  year = {1996},
  isbn = {0-89791-778-2},
  doi = {10.1145/234828.234834},
  url = {http://portal.acm.org/citation.cfm?id=234834&dl=ACM&coll=GUIDE},
  keywords = {Analyse de texte, Fouille de texte},
  annote = {{{\textless}p{\textgreater}saltonGerard1981.pdf{\textless}/p{\textgreater}}},
},
-
C. Zuell, J. Harkness, and J. H. P. Hoffmeyer-Zlotnik, Text analysis and computers, Mannheim, Germany: ZUMA, 1996, vol. 1.
@book{zuell_text_1996,
  author = {Zuell, Cornelia and Harkness, Janet and Hoffmeyer-Zlotnik, Juergen H. P.},
  title = {Text analysis and computers},
  series = {{ZUMA-Nachrichten} Spezial},
  volume = {1},
  publisher = {{ZUMA}},
  address = {Mannheim, Germany},
  year = {1996},
  isbn = {3-924220-11-5},
  url = {http://www.gesis.org/Publikationen/Zeitschriften/ZUMA_Nachrichten_spezial/documents/pdfs/znspezial1.pdf},
  keywords = {Analyse de texte, Informatique},
  annote = {{{\textless}p{\textgreater}zuellCornelia1996.pdf{\textless}/p{\textgreater}}},
},
-
J.-G. Meunier, "La lecture et l’analyse de texte assistée par ordinateur : la chaîne d’analyse," Cahiers de recherche du Laboratoire d’ANalyse Cognitive de l’Information, vol. 6, 1995.
@article{meunier_la_1995,
  author = {Meunier, Jean-Guy},
  title = {La lecture et l'analyse de texte assistée par ordinateur : la chaîne d'analyse},
  journal = {Cahiers de recherche du Laboratoire {d'ANalyse} Cognitive de {l'Information}},
  volume = {6},
  year = {1995},
  keywords = {Analyse de texte},
},
-
F. Rastier, L’analyse thématique des données textuelles : l’exemple des sentiments, Paris: Didier Erudition, 1995.
@book{rastier_lanalyse_1995,
  author = {Rastier, François},
  title = {L'analyse thématique des données textuelles : l'exemple des sentiments},
  publisher = {Didier Erudition},
  address = {Paris},
  year = {1995},
  url = {http://www.revue-texto.net/Parutions/Analyse-thematique/Analyse-thematique.html},
  abstract = {Ce volume s'organise autour d'un problème, d'un outil informatique et d'un corpus. Le problème intéresse toute description sémantique de textes : comment définir et identifier des thèmes, retracer leurs liens privilégiés, dessiner leur évolution diachronique ? Comme l'expansion des banques de données textuelles suscite des besoins croissants, le développement de la thématique revêt un intérêt considérable, tant pour l'indexation que pour l'exploitation des textes. Les enjeux de l'ouvrage pourront être récapitulés dans le chapitre final ; il faut avant tout justifier ici le choix des données. Nous en sommes restés au discours littéraire, car les textes littéraires sont les plus nombreux dans la banque Frantext. Nous avons sélectionné un corpus de romans, genre textuel le mieux représenté : il regroupe 397 tomes de romans et recueils de nouvelles français publiés de 1830 à 1970, soit 350 œuvres. Ce nombre assure au corpus une masse critique qui permet des traitements statistiques significatifs et surtout se prête à débats et conjectures. Les bornes chronologiques se justifient ainsi : l'unification des normes typographiques remonte à 1827, et la date de 1830 met à l'abri de variations qui gêneraient l'interrogation. Après 1970, le corpus des romans saisi est insuffisant, en quantité sinon en qualité. On verra que cette étendue temporelle permet de retracer des évolutions significatives. Ce volume présente la première étude thématique sur un corpus romanesque d'une telle ampleur : il pose ainsi les problèmes à une toute autre échelle que les monographies dont on dispose généralement. En choisissant le thème des sentiments, nous ne pensions peut-être ne guère prendre de risques, mais les résultats de l'enquête engagent à reconsidérer bien des idées reçues.},
  keywords = {Analyse de texte, Fouille de texte},
  annote = {{{\textless}p{\textgreater}rastierFrancois1995.pdf{\textless}/p{\textgreater}}},
},
-
L. Lebart and A. Salem, Statistique textuelle, Paris: Dunod, 1994.
@book{lebart_statistique_1994,
  author = {Ludovic Lebart and André Salem},
  title = {Statistique textuelle},
  publisher = {Dunod},
  address = {Paris},
  year = {1994},
  isbn = {2100022393},
  keywords = {Analyse de texte, Approche statistique},
},
-
B. Gervais, À l’écoute de la lecture, Montréal: VLB, 1993.
@book{gervais__1993,
  author = {Bertrand Gervais},
  title = {À l'écoute de la lecture},
  publisher = {{VLB}},
  address = {Montréal},
  year = {1993},
  keywords = {Analyse de texte},
},
-
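M. A. Hearst and C. Plaunt, "Subtopic structuring for full-length document access," in Proceedings of the 16th annual international ACM SIGIR conference on Research and development in information retrieval, Pittsburgh, Pa., 1993, pp. 59-68.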
@inproceedings{hearst_subtopic_1993,
  author = {Marti A. Hearst and Christian Plaunt},
  title = {Subtopic structuring for full-length document access},
  booktitle = {Proceedings of the 16th annual international {ACM} {SIGIR} conference on research and development in information retrieval},
  address = {Pittsburgh, Pa.},
  year = {1993},
  pages = {59--68},
  keywords = {Analyse de texte, Document numérique},
  abstract = {We argue that the advent of large volumes of full-length text, as opposed to short texts like abstracts and newswire, should be accompanied by corresponding new approaches to information access. Toward this end, we discuss the merits of imposing structure on full-length text documents; that is, a partition of the text into coherent multi-paragraph units that represent the pattern of subtopics that comprise the text. Using this structure, we can make a distinction between the main topics, which occur throughout the length of the text, and the subtopics, which are of only limited extent. We discuss why recognition of subtopic structure is important and how, to some degree of accuracy, it can be found. We describe a new way of specifying queries on full-length documents and then describe an experiment in which making use of the recognition of local structure achieves better results on a typical information retrieval task than does a standard {IR} measure.}
},
-
L. Lebart and A. Salem, Analyse statistique des données textuelles, Paris: Dunod, 1988.
@book{lebart_analyse_1988,
  author = {Ludovic Lebart and André Salem},
  title = {Analyse statistique des données textuelles},
  year = {1988},
  publisher = {Dunod},
  address = {Paris},
  keywords = {Analyse de texte}
},
-
R. Giora, "Notes toward a theory of text coherence," Poetics Today, vol. 6, iss. 4, pp. 699-715, 1985.
@article{giora_notes_1985,
  author = {Rachel Giora},
  title = {Notes toward a theory of text coherence},
  journal = {Poetics Today},
  volume = {6},
  number = {4},
  pages = {699--715},
  year = {1985},
  keywords = {Analyse de texte}
},
-
T. Reinhart, "Pragmatics and linguistics. An analysis of sentence topics," Philosophica, vol. 27, pp. 53-93, 1981.
@article{reinhart_pragmatics_1981,
  author = {Tanya Reinhart},
  title = {Pragmatics and linguistics. An analysis of sentence topics},
  journal = {Philosophica},
  volume = {27},
  pages = {53--93},
  year = {1981},
  keywords = {Analyse de texte, Linguistique}
},
-
W. Kintsch and T. A. van Dijk, "Toward a model of text comprehension and production," Psychological Review, vol. 85, iss. 5, pp. 363-394, 1978.
@article{kintsch_towardmodel_1978,
  author = {Kintsch, Walter and van Dijk, Teun A.},
  title = {Toward a model of text comprehension and production},
  journal = {Psychological Review},
  volume = {85},
  number = {5},
  pages = {363--394},
  year = {1978},
  keywords = {Analyse de texte},
  abstract = {Described is the system of mental operations occurring in text comprehension and in recall and summarization. A processing model is outlined: 1) the meaning elements of a text become organized into a coherent whole, 2) the full meaning of the text is condensed into its gist, and 3) new texts are generated from the comprehension processes.}
},
-
J. Hamesse, "L’informatique et l’analyse des textes," Revue Philosophique de Louvain, vol. 75, pp. 251-266, 1977.
@article{hamesse_linformatique_1977,
  author = {Jacqueline Hamesse},
  title = {L’informatique et l’analyse des textes},
  journal = {Revue Philosophique de Louvain},
  volume = {75},
  pages = {251--266},
  year = {1977},
  issn = {00353841},
  keywords = {Analyse de texte, Informatique}
},
-
V. Propp, Morphology of the folktale, Austin: University of Texas Press, 1928.
@book{propp_morphology_1928,
  author = {Vladimir Propp},
  title = {Morphology of the folktale},
  year = {1928},
  publisher = {University of Texas Press},
  address = {Austin},
  keywords = {Analyse de texte, Philosophie}
},
-
T. Kalledat, Considering changes in semantic representations of texts.
@misc{kalledat_considering_????,
  author = {Thobias Kalledat},
  title = {Considering changes in semantic representations of texts},
  keywords = {Analyse de texte, Linguistique},
  annote = {{{\textless}p{\textgreater}kalledatThobias.pdf{\textless}/p{\textgreater}}}
},
-
H. Liu and H. Motoda, Computational methods of feature selection, Boca Raton: Chapman & Hall/CRC, 2008.
@book{liu_computational_2008,
  author = {Huan Liu and Hiroshi Motoda},
  title = {Computational methods of feature selection},
  year = {2008},
  series = {Chapman \& {Hall/CRC} data mining and knowledge discovery series},
  publisher = {Chapman \& {Hall/CRC}},
  address = {Boca Raton},
  isbn = {9781584888789},
  keywords = {Apprentissage machine}
},
-
C. Köse, Ö. Özyurt, and C. İkibaş, "A comparison of textual data mining methods for sex identification in Chat conversations," in Information retrieval technology : 4th Asia information retrieval symposium, AIRS 2008, Harbin, China, january 15-18, 2008 : revised selected papers, Berlin; Heidelberg, 2008, pp. 638-643.
@inproceedings{kse_comparison_2008,
  author = {Cemal Köse and Özcan Özyurt and Cevat İkibaş},
  title = {A comparison of textual data mining methods for sex identification in Chat conversations},
  booktitle = {Information retrieval technology : 4th Asia information retrieval symposium, {AIRS} 2008, Harbin, China, january 15-18, 2008 : revised selected papers},
  series = {Lecture notes in computer science},
  volume = {4993},
  pages = {638--643},
  year = {2008},
  publisher = {Springer},
  address = {Berlin; Heidelberg},
  url = {http://dx.doi.org/10.1007/978-3-540-68636-1_76},
  keywords = {Apprentissage machine, Fouille de texte, Recherche d'information},
  abstract = {Mining textual data in chat mediums is becoming more important because these mediums contain a vast amount of information, which is potentially relevant to a society’s current interests, habits, social behaviors, crime tendency and other tendencies. Here, sex identification is taken as a base study in information mining in chat mediums. In order to do this, a simple discrimination function and semantic analysis method are proposed for sex identification in Turkish chat mediums. Then, the proposed sex identification method is compared with the Support Vector Machine {(SVM)} and Naive Bayes {(NB)} methods. Finally, results show that the proposed system has achieved accuracy over 90\% in sex identification.}
},
-
L. Zhou, Y. Shi, and D. Zhang, "A statistical language modeling approach to online deception detection," IEEE Transactions on Knowledge and Data Engineering, vol. 20, iss. 8, pp. 1077-1081, 2008.
@article{zhou_statistical_2008,
  author = {Lina Zhou and Yongmei Shi and Dongsong Zhang},
  title = {A statistical language modeling approach to online deception detection},
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume = {20},
  number = {8},
  pages = {1077--1081},
  year = {2008},
  issn = {1041-4347},
  doi = {10.1109/TKDE.2007.190624},
  url = {http://ieeexplore.ieee.org/Xplore/login.jsp?url=/iel5/69/4553782/04358936.pdf?tp=&isnumber=4553782&arnumber=4358936&punumber=69},
  keywords = {Apprentissage machine, Approche statistique},
  abstract = {Online deception is disrupting our daily life, organizational process, and even national security. Existing approaches to online deception detection follow a traditional paradigm by using a set of cues as antecedents for deception detection, which may be hindered by ineffective cue identification. Motivated by the strength of statistical language models {(SLMs)} in capturing the dependency of words in text without explicit feature extraction, we developed {SLMs} to detect online deception. We also addressed the data sparsity problem in building {SLMs} in general and in deception detection in specific using smoothing and vocabulary pruning techniques. The developed {SLMs} were evaluated empirically with diverse datasets. The results showed that the proposed {SLM} approach to deception detection outperformed a state-of-the-art text categorization method as well as traditional feature-based methods.},
  annote = {{{\textless}p{\textgreater}zhouLina2008.pdf{\textless}/p{\textgreater}}}
},
-
M. Torii and K. Vijay-Shanker, "Sortal anaphora resolution in Medline abstracts," Computational Intelligence, vol. 23, pp. 15-27, 2007.
@article{torii_sortal_2007,
  author = {Manabu Torii and K. {Vijay-Shanker}},
  title = {Sortal anaphora resolution in Medline abstracts},
  journal = {Computational Intelligence},
  volume = {23},
  number = {1},
  pages = {15--27},
  year = {2007},
  url = {http://www.ingentaconnect.com/content/bpl/coin/2007/00000023/00000001/art00003},
  keywords = {Apprentissage machine, Bio informatic, Langage naturel},
  abstract = {This paper reports our investigation of machine learning methods applied to anaphora resolution for biology texts, particularly paper abstracts. Our primary concern is the investigation of features and their combinations for effective anaphora resolution. In this paper, we focus on the resolution of demonstrative phrases and definite determiner phrases, the two most prevalent forms of anaphoric expressions that we find in biology research articles. Different resolution models are developed for demonstrative and definite determiner phrases. Our work shows that models may be optimized differently for each of the phrase types. Also, because a significant number of definite determiner phrases are not anaphoric, we induce a model to detect anaphoricity, i.e., a model that classifies phrases as either anaphoric or nonanaphoric. We propose several novel features that we call highlighting features, and consider their utility particularly for processing paper abstracts. The system using the highlighting features achieved accuracies of 78\% and 71\% for demonstrative phrases and definite determiner phrases, respectively. The use of the highlighting features reduced the error rate by about 10\%.}
},
-
A. Juárez-González, A. Téllez-Valero, C. Denicia-Carral, M. Montes-y-Gómez, and L. Villaseñor-Pineda, "Using machine learning and text mining in question answering," in Evaluation of multilingual and multi-modal information retrieval : 7th workshop of the cross-language evaluation forum, CLEF 2006, Alicante, Spain, september 20-22, 2006 : revised selected papers, Berlin; Heidelberg, 2007, pp. 415-423.
@inproceedings{jurez-gonzlez_using_2007,
  author = {Antonio {Juárez-González} and Alberto {Téllez-Valero} and Claudia {Denicia-Carral} and Manuel {Montes-y-Gómez} and Luis {Villaseñor-Pineda}},
  title = {Using machine learning and text mining in question answering},
  booktitle = {Evaluation of multilingual and multi-modal information retrieval : 7th workshop of the cross-language evaluation forum, {CLEF} 2006, Alicante, Spain, september 20-22, 2006 : revised selected papers},
  series = {Lecture notes in computer science},
  volume = {4730},
  pages = {415--423},
  year = {2007},
  publisher = {Springer},
  address = {Berlin; Heidelberg},
  url = {http://dx.doi.org/10.1007/978-3-540-74999-8_49},
  keywords = {Apprentissage machine, Fouille de texte},
  abstract = {This paper describes a {QA} system centered in a full data-driven architecture. It applies machine learning and text mining techniques to identify the most probable answers to factoid and definition questions respectively. Its major quality is that it mainly relies on the use of lexical information and avoids applying any complex language processing resources such as named entity classifiers, parsers and ontologies. Experimental results on the Spanish Question Answering task at {CLEF} 2006 show that the proposed architecture can be a practical solution for monolingual question answering by reaching a precision as high as 51\%.}
},
-
R. Wille, "Methods of conceptual knowledge processing," in Formal concept analysis, Springer, 2006, pp. 1-29.
@incollection{wille_methods_2006,
  author = {Rudolf Wille},
  title = {Methods of conceptual knowledge processing},
  booktitle = {Formal concept analysis},
  series = {Lecture notes in computer science},
  volume = {3874},
  pages = {1--29},
  year = {2006},
  publisher = {Springer},
  isbn = {978-3-540-32203-0},
  keywords = {Apprentissage machine, Science cognitive},
  abstract = {The offered methods of Conceptual Knowledge Processing are procedures which are well-planed to mean and purpose and therewith lead to skills for solving practical tasks. The used means and skills have been mainly created as translations of mathematical means and skills of Formal Concept Analysis. Those transdisciplinary translations may be understood as transformations from mathematical thinking, dealing with potential realities, to logical thinking, dealing with actual realities. Each of the 38 presented methods is discussed in a general language of logical nature, while citations give links to the underlying mathematical background. Applications of the methods are demonstrated by concrete examples mostly taken from the literature to which explicit references are given.},
  annote = {{{\textless}p{\textgreater}willeRudolf2006.pdf{\textless}/p{\textgreater}}}
},
-
A. Christy and P. Thambidurai, "Efficient information extraction using machine learning and classification using genetic and C4.8 algorithms," Information Technology Journal, vol. 5, iss. 6, pp. 1023-1027, 2006.
@article{christy_efficient_2006,
  author = {A. Christy and P. Thambidurai},
  title = {Efficient information extraction using machine learning and classification using genetic and C4.8 algorithms},
  journal = {Information Technology Journal},
  volume = {5},
  number = {6},
  pages = {1023--1027},
  year = {2006},
  keywords = {Apprentissage machine, Fouille de donnée},
  abstract = {With the amount of information available on the Internet growing at phenomenal rate, research in improving the effectiveness and efficiency of information extraction and knowledge discovery has become crucial. Text mining is one of the most important ways of extracting meaningful information from a large collection of text documents, leaving aside the information which is not useful to the ordinary user. In this study, we propose a method for automatically extracting key elements from a collection of text documents by extracting a set of features using a machine learning technique. We have used the genetic algorithms for classifying the features those are selected by the machine learning technique. We also compared the results produced by the genetic algorithm with 10 folds cross-validation at C4.8, Rain Forest, Raintree and {NB} tree methods and we have found C4.8 has produced better precision and recall and also the genetic algorithm is an effective classifier and is quite competitive even though the concept increases in complexity},
  annote = {{{\textless}p{\textgreater}Copyright} 2006, The Institution of Engineering and Technology 9193959 information extraction machine learning genetic algorithm C4.8 algorithm Internet knowledge discovery text mining Rain Forest Raintree {NB} tree parsing feature set extraction{\textless}/p{\textgreater}}
},
-
H. Al-mubaid and S. A. Umair, "A new text categorization technique using distributional clustering and learning logic," IEEE Transactions on Knowledge and Data Engineering, vol. 18, iss. 9, 2006.
@article{al-mubaid_new_2006,
  author = {Hisham Al-mubaid and Syed A. Umair},
  title = {A new text categorization technique using distributional clustering and learning logic},
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume = {18},
  number = {9},
  year = {2006},
  url = {http://citeseer.ist.psu.edu/761086.html},
  keywords = {Apprentissage machine, Catégorisation},
  abstract = {Text categorization is continuing to be one of the most researched {NLP} problems due to the ever-increasing amounts of electronic documents and digital libraries. In this paper, we present a new text categorization method that combines the distributional clustering of words and a learning logic technique, called Lsquare, for constructing text classifiers. The high dimensionality of text in a document has not been fruitful for the task of categorization, for which reason, feature clustering has been proven to be an ideal alternative to feature selection for reducing the dimensionality. We, therefore, use distributional clustering method {(IB)} to generate an efficient representation of documents and apply Lsquare for training text classifiers. The method was extensively tested and evaluated. The proposed method achieves higher or comparable classification accuracy and F1 results compared with {SVM} on exact experimental settings with a small number of training documents on three benchmark data sets {WebKB,} {20Newsgroup,} and Reuters-21578. The results prove that the method is a good choice for applications with a limited amount of labeled training data. We also demonstrate the effect of changing training size on the classification performance of the learners.},
  annote = {{{\textless}p{\textgreater}al-mubaidHisham2006.pdf{\textless}/p{\textgreater}}}
},
-
V. Claveau and M.-C. L’Homme, "Structuring terminology using analogy-based machine learning," in Proceedings of the 7th international conference on terminology and knowledge engineering, TKE 2005, Copenhagen (Denmark), 17-18 august 2005, 2005.
@inproceedings{claveau_structuring_2005,
  author = {Claveau, Vincent and L'Homme, Marie-Claude},
  title = {Structuring terminology using analogy-based machine learning},
  booktitle = {Proceedings of the 7th international conference on terminology and knowledge engineering, {TKE} 2005, Copenhagen {(Denmark),} 17-18 august 2005},
  year = {2005},
  url = {http://www.olst.umontreal.ca/pdf/Claveau-LHomme-tke05.pdf},
  keywords = {Apprentissage machine, Terminologie},
  annote = {{{\textless}p{\textgreater}claveauVincent2005\_1.pdf{\textless}/p{\textgreater}}}
},
-
P. Cimiano and J. Volker, "Text2Onto : a framework for ontology learning and data-driven change discovery," in Natural language processing and information systems : 10th international conference on applications of natural language to information systems, NLDB 2005, Alicante, Spain, june 15-17 : proceedings, Berlin; Heidelberg, 2005, pp. 227-238.
@inproceedings{cimiano_text2onto_2005,
  author = {P. Cimiano and J. Volker},
  title = {{Text2Onto} : a framework for ontology learning and data-driven change discovery},
  booktitle = {Natural language processing and information systems : 10th international conference on applications of natural language to information systems, {NLDB} 2005, Alicante, Spain, june 15-17 : proceedings},
  series = {Lecture notes in computer science},
  volume = {3513},
  pages = {227--238},
  year = {2005},
  publisher = {Springer},
  address = {Berlin; Heidelberg},
  issn = {0302-9743},
  keywords = {Apprentissage machine, Découverte de connaissances, Ontologie},
  abstract = {In this paper we present Text2Onto, a framework for ontology learning from textual resources. Three main features distinguish Text2Onto from our earlier framework {TextToOnto} as well as other state-of-the-art ontology learning frameworks. First, by representing the learned knowledge at a meta-level in the form of instantiated modeling primitives within a so called Probabilistic Ontology Model {(POM),} we remain independent of a concrete target language while being able to translate the instantiated primitives into any (reasonably expressive) knowledge representation formalism. Second, user interaction is a core aspect of Text2Onto and the fact that the system calculates a confidence for each learned object allows to design sophisticated visualizations of the {POM.} Third, by incorporating strategies for data-driven change discovery, we avoid processing the whole corpus from scratch each time it changes, only selectively updating the {POM} according to the corpus changes instead. Besides increasing efficiency in this way, it also allows a user to trace the evolution of the ontology with respect to the changes in the underlying corpus.}
},
-
V. Claveau and M.-C. L’Homme, "Apprentissage par analogie pour la structuration de terminologie : utilisation comparée de ressources endogènes et exogènes," in Terminologie et intelligence artificielle, TIA 2005. Actes, Université de Rouen, Rouen (France), 4-5 avril 2005, 2005.
@inproceedings{claveau_apprentissage_2005,
  author = {Claveau, Vincent and L'Homme, Marie-Claude},
  title = {Apprentissage par analogie pour la structuration de terminologie : utilisation comparée de ressources endogènes et exogènes},
  booktitle = {Terminologie et intelligence artificielle, {TIA} 2005. Actes, Université de Rouen, Rouen {(France),} 4-5 avril 2005},
  year = {2005},
  keywords = {Apprentissage machine},
  abstract = {Cet article présente une méthode originale pour détecter en corpus spécialisé des couples de termes morphologiquement liés et prédire le lien sémantique qui les unit dans le domaine étudié. Ces liens sémantiques, modélisés à l’aide de fonctions lexicales, permettent ainsi de structurer une terminologie du domaine. La méthode exposée repose sur une technique d’apprentissage artificiel par analogie qui permet de confronter efficacement des couples de mots inconnus à des exemples de couples de termes dont le lien sémantique est connu. Elle tire également parti d’un système d’extraction de termes qui permet d’éviter la détection de liens non pertinents dans le domaine. Cette approche est évaluée dans le domaine de l’informatique ; les résultats montrent que l’approche simple que nous proposons est très performante. Deux expériences sont notamment menées, l’une utilisant des exemples issus du domaine, l’autre, des exemples tirés d’une base généraliste. La comparaison des résultats de chacune d’elles permet ainsi d’évaluer quantitativement l’intérêt de telles ressources généralistes pour ce type de tâches et apporte ainsi une contribution chiffrée au débat opposant l’utilisation ressources endogènes et exogènes en terminologie computationnelle.},
  annote = {{{\textless}p{\textgreater}claveauVincent2005.pdf{\textless}/p{\textgreater}}}
},
-
D. Zhang and W. S. Lee, "Learning to integrate Web taxonomies," Journal of Web semantics, vol. 2, iss. 2, 2005.
@article{zhang_learning_2005,
  author = {Dell Zhang and Wee Sun Lee},
  title = {Learning to integrate Web taxonomies},
  journal = {Journal of Web semantics},
  volume = {2},
  number = {2},
  pages = {131--151},
  year = {2005},
  url = {http://www.comp.nus.edu.sg/~leews/publications/dellzhang_ws2004.pdf},
  keywords = {Apprentissage machine, Classification, Ontologie, Taxonomie, Web sémantique},
  abstract = {We investigate machine learning methods for automatically integrating objects from different taxonomies into a master taxonomy. This problem is not only currently pervasive on the Web, but is also important to the emerging Semantic Web. A straightforward approach to automating this process would be to build classifiers through machine learning and then use these classifiers to classify objects from the source taxonomies into categories of the master taxonomy. However, conventional machine learning algorithms totally ignore the availability of the source taxonomies. In fact, source and master taxonomies often have common categories under different names or other more complex semantic overlaps. We introduce two techniques that exploit the semantic overlap between the source and master taxonomies to build better classifiers for the master taxonomy. The first technique, Cluster Shrinkage, biases the learning algorithm against splitting source categories by making objects in the same category appear more similar to each other. The second technique, {Co-Bootstrapping,} tries to facilitate the exploitation of inter-taxonomy relationships by providing category indicator functions as additional features for the objects. Our experiments with real-world Web data show that these proposed add-on techniques can enhance various machine learning algorithms to achieve substantial improvements in performance for taxonomy integration.},
  annote = {{{\textless}p{\textgreater}zhangDell2005.pdf{\textless}/p{\textgreater}}}
},
-
S. Pradhan, K. Hacioglu, V. Krugler, W. Ward, J. Martin, and D. Jurafsky, "Support vector learning for semantic argument classification," Machine Learning, vol. 60, pp. 11-39, 2005.
@article{pradhan_support_2005,
  author = {Sameer Pradhan and Kadri Hacioglu and Valerie Krugler and Wayne Ward and James Martin and Daniel Jurafsky},
  title = {Support vector learning for semantic argument classification},
  journal = {Machine Learning},
  volume = {60},
  pages = {11--39},
  year = {2005},
  url = {http://www.ingentaconnect.com/content/klu/ml/2005/00000060/F0030001/00000912},
  keywords = {Apprentissage machine, Classification},
  abstract = {The natural language processing community has recently experienced a growth of interest in domain independent shallow semantic parsing—the process of assigning a Who did What to Whom, When, Where, Why, How etc. structure to plain text. This process entails identifying groups of words in a sentence that represent these semantic arguments and assigning specific labels to them. It could play a key role in {NLP} tasks like Information Extraction, Question Answering and Summarization. We propose a machine learning algorithm for semantic role parsing, extending the work of Gildea and Jurafsky (2002), Surdeanu et al. (2003) and others. Our algorithm is based on Support Vector Machines which we show give large improvement in performance over earlier classifiers. We show performance improvements through a number of new features designed to improve generalization to unseen data, such as automatic clustering of verbs. We also report on various analytic studies examining which features are most important, comparing our classifier to other machine learning algorithms in the literature, and testing its generalization to new test set from different genre. On the task of assigning semantic labels to the {PropBank} {(Kingsbury,} Palmer, \& Marcus, 2002) corpus, our final system has a precision of 84\% and a recall of 75\%, which are the best results currently reported for this task. Finally, we explore a completely different architecture which does not requires a deep syntactic parse. We reformulate the task as a combined chunking and classification problem, thus allowing our algorithm to be applied to new languages or genres of text for which statistical syntactic parsers may not be available.}
},
-
N. Ireson, F. Ciravegna, M. E. Califf, D. Freitag, N. Kushmerick, and A. Lavelli, "Evaluating machine learning for information extraction," ACM International Conference Proceeding Series, vol. 119, pp. 345-352, 2005.
@article{ireson_evaluating_2005,
  author = {N. Ireson and F. Ciravegna and M. E. Califf and D. Freitag and N. Kushmerick and A. Lavelli},
  title = {Evaluating machine learning for information extraction},
  journal = {{ACM} International Conference Proceeding Series},
  volume = {119},
  pages = {345--352},
  year = {2005},
  keywords = {Apprentissage machine, Extraction d'information}
},
-
D. Zhang and W. S. Lee, "Learning to integrate Web taxonomies," Web Semantics: Science, Services and Agents on the World Wide Web, vol. 2, iss. 2, pp. 131-151, 2004.
@article{zhang_learning_2004,
  author = {Dell Zhang and Wee Sun Lee},
  title = {Learning to integrate Web taxonomies},
  journal = {Web Semantics: Science, Services and Agents on the World Wide Web},
  volume = {2},
  number = {2},
  pages = {131--151},
  year = {2004},
  url = {http://www.sciencedirect.com/science/article/B758F-4DS962C-1/2/8f2f5cedab10f505a7b404e03cc0788f},
  keywords = {Apprentissage machine, Classification, Ontologie, Taxonomie, Web sémantique},
  abstract = {We investigate machine learning methods for automatically integrating objects from different taxonomies into a master taxonomy. This problem is not only currently pervasive on the Web, but is also important to the emerging Semantic Web. A straightforward approach to automating this process would be to build classifiers through machine learning and then use these classifiers to classify objects from the source taxonomies into categories of the master taxonomy. However, conventional machine learning algorithms totally ignore the availability of the source taxonomies. In fact, source and master taxonomies often have common categories under different names or other more complex semantic overlaps. We introduce two techniques that exploit the semantic overlap between the source and master taxonomies to build better classifiers for the master taxonomy. The first technique, Cluster Shrinkage, biases the learning algorithm against splitting source categories by making objects in the same category appear more similar to each other. The second technique, {Co-Bootstrapping,} tries to facilitate the exploitation of inter-taxonomy relationships by providing category indicator functions as additional features for the objects. Our experiments with real-world Web data show that these proposed add-on techniques can enhance various machine learning algorithms to achieve substantial improvements in performance for taxonomy integration.}
},
-
J.-M. Torres-Moreno, R. Vazquez-Pérez, P. Belot, M. El-Bèze, P.-L. St-Onge, and M. Gagnon, Coupling an automatic-summarization system with a question-answering system (qaas), 2004.
@misc{torres-moreno_couplingautomatic-summarization_2004,
  author = {Torres-Moreno, Juan-Manuel and Vazquez-Pérez, Rafael and Belot, Patrice and El-Bèze, Marc and St-Onge, Pier-Luc and Gagnon, Michel},
  title = {Coupling an automatic-summarization system with a question-answering system (qaas)},
  month = oct,
  year = {2004},
  keywords = {Apprentissage machine, Fouille de texte, Recherche d'information},
  annote = {{{\textless}p{\textgreater}torres-morenoJuan-manuel2004{\textless}/p{\textgreater}}}
},
-
CS 478 Machine learning : measuring accuracy, 2004.
@misc{_cs_2004,
  title = {{CS} 478 Machine learning : measuring accuracy},
  type = {{PowerPoint}},
  year = {2004},
  keywords = {Apprentissage machine}
},
-
D. J. C. MacKay, Information theory, inference, and learning algorithms, Version 7.0 ed., Cambridge University Press, 2004.
@book{mackay_information_2004,
  author = {MacKay, David J. C.},
  title = {Information theory, inference, and learning algorithms},
  edition = {Version 7.0},
  year = {2004},
  publisher = {Cambridge University Press},
  url = {http://www.cs.toronto.edu/~mackay/itprnn/book.pdf},
  keywords = {Apprentissage machine},
  annote = {{{\textless}p{\textgreater}mackayDavid2004.pdf{\textless}/p{\textgreater}}}
},
-
J. Demsar, B. Zupan, and G. Leban, Orange : from experimental machine learning to interactive data mining, 2004.
@misc{demsar_orange_2004,
  author = {J. Demsar and B. Zupan and G. Leban},
  title = {Orange : from experimental machine learning to interactive data mining},
  journal = {University of Ljubljana},
  howpublished = {http://magix.fri.uni-lj.si/orange/},
  year = {2004},
  url = {http://magix.fri.uni-lj.si/orange/},
  keywords = {Apprentissage machine, Fouille de donnée}
},
-
E. Alpaydin, Introduction to machine learning, Cambridge, Mass.: MIT Press, 2004.
@book{alpaydin_introduction_2004,
  author    = {Ethem Alpaydin},
  title     = {Introduction to machine learning},
  series    = {Adaptive computation and machine learning},
  publisher = {{MIT} Press},
  address   = {Cambridge, Mass.},
  year      = {2004},
  isbn      = {0262012111},
  keywords  = {Apprentissage machine},
  abstract  = {The goal of machine learning is to program computers to use example data or past experience to solve a given problem. Many successful applications of machine learning exist already, including systems that analyze past sales data to predict customer behavior, recognize faces or spoken speech, optimize robot behavior so that a task can be completed using minimum resources, and extract knowledge from bioinformatics data. Introduction to Machine Learning is a comprehensive textbook on the subject, covering a broad array of topics not usually included in introductory machine learning texts. It discusses many methods based in different fields, including statistics, pattern recognition, neural networks, artificial intelligence, signal processing, control, and data mining, in order to present a unified treatment of machine learning problems and solutions. All learning algorithms are explained so that the student can easily move from the equations in the book to a computer program. The book can be used by advanced undergraduates and graduate students who have completed courses in computer programming, probability, calculus, and linear algebra. It will also be of interest to engineers in the field who are concerned with the application of machine learning methods. After an introduction that defines machine learning and gives examples of machine learning applications, the book covers supervised learning, Bayesian decision theory, parametric methods, multivariate methods, dimensionality reduction, clustering, nonparametric methods, decision trees, linear discrimination, multilayer perceptrons, local models, hidden Markov models, assessing and comparing classification algorithms, combining multiple learners, and reinforcement learning.},
  annote    = {{{\textless}p{\textgreater}TOC} : 1. Introduction -- 2. Supervised learning -- 3. Bayesian decision theory -- 4. Parametric methods -- 5. Multivariate methods -- 6. Dimensionality reduction -- 7. Clustering -- 8. Nonparametric methods -- 9. Decision trees -- 10. Linear discrimination -- 11. Multilayer perceptrons -- 12. Local models -- 13. Hidden Markov models -- 14. Assessing and comparing classification algorithms -- 15. Combining multiple learners -- 16. Reinforcement learning -- A. Probability.{\textless}/p{\textgreater}}
},
-
L. Baoli, L. Qin, and Y. Shiwen, "An adaptive k-nearest neighbor text categorization strategy," ACM Transactions on Asian Language Information Processing, vol. 3, iss. 4, pp. 215-226, 2004.
@article{baoli_adaptive_2004,
  author   = {Li Baoli and Lu Qin and Yu Shiwen},
  title    = {An adaptive k-nearest neighbor text categorization strategy},
  journal  = {{ACM} Transactions on Asian Language Information Processing},
  volume   = {3},
  number   = {4},
  pages    = {215--226},
  year     = {2004},
  issn     = {15300226},
  doi      = {10.1145/1039621.1039623},
  url      = {http://portal.acm.org/ft_gateway.cfm?id=1039623&type=pdf&coll=GUIDE&dl=GUIDE&CFID=12636072&CFTOKEN=79254546},
  keywords = {Apprentissage machine, Catégorisation},
  abstract = {k is the most important parameter in a text categorization system based on the k-nearest neighbor algorithm {(kNN).} To classify a new document, the k-nearest documents in the training set are determined first. The prediction of categories for this document can then be made according to the category distribution among the k nearest neighbors. Generally speaking, the class distribution in a training set is not even; some classes may have more samples than others. The system's performance is very sensitive to the choice of the parameter k. And it is very likely that a fixed k value will result in a bias for large categories, and will not make full use of the information in the training set. To deal with these problems, an improved {kNN} strategy, in which different numbers of nearest neighbors for different categories are used instead of a fixed number across all categories, is proposed in this article. More samples (nearest neighbors) will be used to decide whether a test document should be classified in a category that has more samples in the training set. The numbers of nearest neighbors selected for different categories are adaptive to their sample size in the training set. Experiments on two different datasets show that our methods are less sensitive to the parameter k than the traditional ones, and can properly classify documents belonging to smaller classes with a large k. The strategy is especially applicable and promising for cases where estimating the parameter k via cross-validation is not possible and the class distribution of a training set is skewed.},
  annote   = {{{\textless}p{\textgreater}baoliLi2004.pdf{\textless}/p{\textgreater}}}
},
-
A. Basu, C. R. Watters, and M. Shepherd, "Support vector machines for text categorization," in Proceedings of the 36th annual Hawaii international conference on system sciences (HICSS’03), Hawaii, United States, 2003, p. 103.
@inproceedings{basu_support_2003,
  author    = {A. Basu and Carolyn R. Watters and M. Shepherd},
  title     = {Support vector machines for text categorization},
  booktitle = {Proceedings of the 36th annual Hawaii international conference on system sciences {(HICSS'03)}},
  volume    = {4 - track 4},
  pages     = {103.3},
  address   = {Hawaii, United States},
  year      = {2003},
  isbn      = {0-7695-1874-5},
  url       = {http://ieeexplore.ieee.org/iel5/8360/26341/01174243.pdf?arnumber=1174243},
  keywords  = {Apprentissage machine},
  abstract  = {Text categorization is the process of sorting text documents into one or more predefined categories or classes of similar documents. Differences in the results of such categorization arise from the feature set chosen to base the association of a given document with a given category. Advocates of text categorization recognize that the sorting of text documents into categories of like documents reduces the overhead required for fast retrieval of such documents and provides smaller domains in which the users may explore similar documents. In this paper we are interested in examining whether automatic classification of news texts can be improved by a prefiltering the vocabulary to reduce the feature set used in the computations. First we compare artificial neural network and support vector machine algorithms for use as text classifiers of news items. Secondly, we identify a reduction in feature set that provides improved results.},
  annote    = {{{\textless}p{\textgreater}basuA2003.pdf{\textless}/p{\textgreater}}}
},
-
J. Carbo and A. Ledezma, "A machine learning based evaluation of a negotiation between agents involving fuzzy counter-offers," in Advances in Web intelligence : first international Atlantic Web intelligence conference, AWIC 2003, Madrid, Spain, may 2003 : proceedings, New York, 2003, pp. 268-277.
@inproceedings{carbo_machine_2003,
  author    = {Javier Carbo and Agapito Ledezma},
  title     = {A machine learning based evaluation of a negotiation between agents involving fuzzy counter-offers},
  booktitle = {Advances in Web intelligence : first international Atlantic Web intelligence conference, {AWIC} 2003, Madrid, Spain, may 2003 : proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {2663},
  pages     = {268--277},
  publisher = {Springer},
  address   = {New York},
  year      = {2003},
  keywords  = {Apprentissage machine, Fuzzy},
  abstract  = {Negotiation plays a fundamental role in systems composed of multiple autonomous agents. Some negotiations may require a more elaborated dialogue where agents would explain offer rejections in a general and vague way. We propose that agents would represent their disappointment about an offer through a fuzzy set applied to each attribute of the offer. Fuzziness can also be very useful in order to make user profiles more difficult to acquire. The satisfaction of this intention is evaluated using classification techniques to compare the accuracy of the models that were obtained from the observation of the behaviour of the agents. In order to test how much information may be extracted about the internal preferences of agents, the task of modeling is translated into a classification task solved by a technique that would generate symbolic representations, such as m5.},
  annote    = {{{\textless}p{\textgreater}carboJavier2003.pdf{\textless}/p{\textgreater}}}
},
-
J. He, A.-H. Tan, and C.-L. Tan, "On machine learning methods for Chinese document categorization," Applied intelligence, vol. 18, iss. 3, pp. 311-322, 2003.
@article{he_machine_2003,
  author   = {He, Ji and Tan, Ah-Hwee and Tan, Chew-Lim},
  title    = {On machine learning methods for {Chinese} document categorization},
  journal  = {Applied intelligence},
  volume   = {18},
  number   = {3},
  pages    = {311--322},
  month    = may,
  year     = {2003},
  issn     = {0924-669X, 1573-7497},
  doi      = {10.1023/A:1023202221875},
  url      = {http://www.springerlink.com/content/l420103859307306/fulltext.pdf},
  keywords = {Apprentissage machine, Catégorisation},
  abstract = {This paper reports our comparative evaluation of three machine learning methods, namely k Nearest Neighbor {(kNN),} Support Vector Machines {(SVM),} and Adaptive Resonance Associative Map {(ARAM)} for Chinese document categorization. Based on two Chinese corpora, a series of controlled experiments evaluated their learning capabilities and efficiency in mining text classification knowledge. Benchmark experiments showed that their predictive performance were roughly comparable, especially on clean and well organized data sets. While {kNN} and {ARAM} yield better performances than {SVM} on small and clean data sets, {SVM} and {ARAM} significantly outperformed {kNN} on noisy data. Comparing efficiency, {kNN} was notably more costly in terms of time and memory than the other two methods. {SVM} is highly efficient in learning from well organized samples of moderate size, although on relatively large and noisy data the efficiency of {SVM} and {ARAM} are comparable.},
  annote   = {{{\textless}p{\textgreater}heJi2003.pdf{\textless}/p{\textgreater}}}
},
-
G. Forman, "An extensive empirical study of feature selection metrics for text classification," Journal of machine learning research, vol. 3, pp. 1289-1305, 2003.
@article{forman_extensive_2003,
  author   = {George Forman},
  title    = {An extensive empirical study of feature selection metrics for text classification},
  journal  = {Journal of Machine Learning Research},
  volume   = {3},
  pages    = {1289--1305},
  year     = {2003},
  issn     = {1533-7928},
  url      = {http://portal.acm.org/citation.cfm?id=944919.944974&coll=GUIDE&dl=GUIDE},
  keywords = {Apprentissage machine},
  abstract = {Machine learning for text classification is the cornerstone of document categorization, news filtering, document routing, and personalization. In text domains, effective feature selection is essential to make the learning task efficient and more accurate. This paper presents an empirical comparison of twelve feature selection methods (e.g. Information Gain) evaluated on a benchmark of 229 text classification problem instances that were gathered from Reuters, {TREC,} {OHSUMED,} etc. The results are analyzed from multiple goal perspectives-accuracy, F-measure, precision, and recall-since each is appropriate in different situations. The results reveal that a new feature selection metric we call {'Bi-Normal} Separation' {(BNS),} outperformed the others by a substantial margin in most situations. This margin widened in tasks with high class skew, which is rampant in text classification problems and is particularly challenging for induction algorithms. A new evaluation methodology is offered that focuses on the needs of the data mining practitioner faced with a single dataset who seeks to choose one (or a pair of) metrics that are most likely to yield the best performance. From this perspective, {BNS} was the top single choice for all goals except precision, for which Information Gain yielded the best result most often. This analysis also revealed, for example, that Information Gain and {Chi-Squared} have correlated failures, and so they work poorly together. When choosing optimal pairs of metrics for each of the four performance goals, {BNS} is consistently a member of the pair---e.g., for greatest recall, the pair {BNS} + F1-measure yielded the best performance on the greatest number of tasks by a considerable margin.},
  annote   = {{{\textless}p{\textgreater}formanGeorge2003.pdf{\textless}/p{\textgreater}}}
},
-
S. Tong and D. Koller, "Support vector machine active learning with applications to text classification," Journal of Machine Learning Research, vol. 2, pp. 45-66, 2002.
@article{tong_support_2002,
  author   = {Simon Tong and Daphne Koller},
  title    = {Support vector machine active learning with applications to text classification},
  journal  = {Journal of Machine Learning Research},
  volume   = {2},
  pages    = {45--66},
  year     = {2002},
  issn     = {15337928},
  url      = {http://portal.acm.org/ft_gateway.cfm?id=944793&type=pdf&coll=GUIDE&dl=GUIDE,&CFID=7300027&CFTOKEN=72434572},
  keywords = {Apprentissage machine, Classification},
  abstract = {Support vector machines have met with significant success in numerous real-world learning tasks. However, like most machine learning algorithms, they are generally applied using a randomly selected training set classified in advance. In many settings, we also have the option of using {\textless}em{\textgreater}pool-based active learning{\textless}/em{\textgreater}. Instead of using a randomly selected training set, the learner has access to a pool of unlabeled instances and can request the labels for some number of them. We introduce a new algorithm for performing active learning with support vector machines, i.e., an algorithm for choosing which instances to request next. We provide a theoretical motivation for the algorithm using the notion of a {\textless}em{\textgreater}version space{\textless}/em{\textgreater}. We present experimental results showing that employing our active learning method can significantly reduce the need for labeled training instances in both the standard inductive and transductive settings.},
  annote   = {{{\textless}p{\textgreater}tongSimon2002.pdf{\textless}/p{\textgreater}}}
},
-
T. Joachims, Learning to classify text using support vector machines : methods, theory, and algorithms, Boston: Kluwer Academic Publishers, 2002.
@book{joachims_learning_2002,
  author    = {Thorsten Joachims},
  title     = {Learning to classify text using support vector machines : methods, theory, and algorithms},
  series    = {The Springer International Series in Engineering and Computer Science},
  volume    = {668},
  publisher = {Kluwer Academic Publishers},
  address   = {Boston},
  year      = {2002},
  isbn      = {079237679X},
  keywords  = {Apprentissage machine},
  abstract  = {Text Classification, or the task of automatically assigning semantic categories to natural language text, has become one of the key methods for organizing online information. Since hand-coding classification rules is costly or even impractical, most modern approaches employ machine learning techniques to automatically learn text classifiers from examples. However, none of these conventional approaches combines good prediction performance, theoretical understanding, and efficient training algorithms. Based on ideas from Support Vector Machines {(SVMs),} Learning To Classify Text Using Support Vector Machines presents a new approach to generating text classifiers from examples. The approach combines high performance and efficiency with theoretical understanding and improved robustness. In particular, it is highly effective without greedy heuristic components. The {SVM} approach is computationally efficient in training and classification, and it comes with a learning theory that can guide real-world applications. Learning To Classify Text Using Support Vector Machines gives a complete and detailed description of the {SVM} approach to learning text classifiers, including training algorithms, transductive text classification, efficient performance estimation, and a statistical learning model of text classification. In addition, it includes an overview of the field of text classification, making it self-contained even for newcomers to the field. This book gives a concise introduction to {SVMs} for pattern recognition, and it includes a detailed description of how to formulate text-classification tasks for machine learning. Learning To Classify Text Using Support Vector Machines is designed as a reference for researchers and practitioners, and is suitable as a secondary text for graduate-level students in Computer Science within Machine Learning and Language Technology.},
  annote    = {{\textless}p{\textgreater}Originally presented as the author's thesis (doctoral--Universität Dortmund) under the title: ``The maximum-margin approach to learning text classifiers--methods, theory, and algorithms,'' 2001.{\textless}/p{\textgreater} {\textless}p{\textgreater}TOC : Foreword / 1. Introduction -- 2. Text Classification -- 3. Support Vector Machines -- 4. A Statistical Learning Model of Text Classification for {SVMS} -- 5. Efficient Performance Estimators for {SVMS} -- 6. Inductive Text Classification -- 7. Transductive Text Classification -- 8. Training Inductive Support Vector Machines -- 9. Training Transductive Support Vector Machines -- 10. Conclusions.{\textless}/p{\textgreater}}
},
-
Y. Lin, "Support vector machines and the Bayes rule in classification," Data Mining and Knowledge Discovery, vol. 6, iss. 3, pp. 259-275, 2002.
@article{lin_support_2002,
  author   = {Yi Lin},
  title    = {Support vector machines and the Bayes rule in classification},
  journal  = {Data Mining and Knowledge Discovery},
  volume   = {6},
  number   = {3},
  pages    = {259--275},
  year     = {2002},
  issn     = {1384-5810, 1573-756X},
  doi      = {10.1023/A:1015469627679},
  url      = {http://www.springerlink.com/content/u7k2uj4jdxndf3ux/fulltext.pdf},
  keywords = {Apprentissage machine, Classification},
  abstract = {The Bayes rule is the optimal classification rule if the underlying distribution of the data is known. In practice we do not know the underlying distribution, and need to “learn” classification rules from the data. One way to derive classification rules in practice is to implement the Bayes rule approximately by estimating an appropriate classification function. Traditional statistical methods use estimated log odds ratio as the classification function. Support vector machines {(SVMs)} are one type of large margin classifier, and the relationship between {SVMs} and the Bayes rule was not clear. In this paper, it is shown that the asymptotic target of {SVMs} are some interesting classification functions that are directly related to the Bayes rule. The rate of convergence of the solutions of {SVMs} to their corresponding target functions is explicitly established in the case of {SVMs} with quadratic or higher order loss functions and spline kernels. Simulations are given to illustrate the relation between {SVMs} and the Bayes rule in other cases. This helps understand the success of {SVMs} in many classification studies, and makes it easier to compare {SVMs} and traditional statistical methods.},
  annote   = {{{\textless}p{\textgreater}linYi2002.pdf{\textless}/p{\textgreater}}}
},
-
F. Sebastiani, Machine learning in automated text categorization: a bibliography, 2002.
@misc{sebastiani_machine_2002,
  author   = {Fabrizio Sebastiani},
  title    = {Machine learning in automated text categorization: a bibliography},
  year     = {2002},
  url      = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6989&rep=rep1&type=pdf},
  keywords = {Apprentissage machine, Catégorisation},
  abstract = {The automated categorization (or classification) of texts into predefined categories has witnessed a booming interest in the last ten years, due to the increased availability of documents in digital form and the ensuing need to organize them. In the research community the dominant approach to this problem is based on machine learning techniques: a general inductive process automatically builds a classifier by learning, from a set of preclassified documents, the characteristics of the...},
  annote   = {{{\textless}p{\textgreater}sebastianiFabrizio2002\_1.pdf{\textless}/p{\textgreater}}}
},
-
Y. Yang, S. Slattery, and R. Ghani, "A study of approaches to hypertext categorization," Journal of intelligent information systems, vol. 18, iss. 2-3, pp. 219-241, 2002.
@article{yang_study_2002,
  author   = {Yiming Yang and Seán Slattery and Rayid Ghani},
  title    = {A study of approaches to hypertext categorization},
  journal  = {Journal of intelligent information systems},
  volume   = {18},
  number   = {2--3},
  pages    = {219--241},
  year     = {2002},
  doi      = {10.1023/A:1013685612819},
  keywords = {Apprentissage machine, Catégorisation, Fouille de texte},
  abstract = {Hypertext poses new research challenges for text classification. Hyperlinks, {HTML} tags, category labels distributed over linked documents, and meta data extracted from related Web sites all provide rich information for classifying hypertext documents. How to appropriately represent that information and automatically learn statistical patterns for solving hypertext classification problems is an open question. This paper seeks a principled approach to providing the answers. Specifically, we define five hypertext regularities which may (or may not) hold in a particular application domain, and whose presence (or absence) may significantly influence the optimal design of a classifier. Using three hypertext datasets and three well-known learning algorithms {(Naive} Bayes, Nearest Neighbor, and First Order Inductive Learner), we examine these regularities in different domains, and compare alternative ways to exploit them. Our results show that the identification of hypertext regularities in the data and the selection of appropriate representations for hypertext in particular domains are crucial, but seldom obvious, in real-world problems. We find that adding the words in the linked neighborhood to the page having those links (both inlinks and outlinks) were helpful for all our classifiers on one data set, but more harmful than helpful for two out of the three classifiers on the remaining datasets. We also observed that extracting meta data from related Web sites was extremely useful for improving classification accuracy in some of those domains. Finally, the relative performance of the classifiers being tested provided insights into their strengths and limitations for solving classification problems involving diverse and often noisy Web pages.},
  annote   = {{{\textless}p{\textgreater}yangYiming2002.pdf{\textless}/p{\textgreater}}}
},
-
K. H. Lee, J. Kay, B. H. Kang, and U. Rosebrock, "A comparative study on statistical machine learning algorithms and thresholding strategies for automatic text categorization," in Proceedings of the 7th Pacific Rim International Conference on Artificial Intelligence: Trends in Artificial Intelligence, London, UK, 2002, pp. 444-453.
@inproceedings{lee_comparative_2002,
  author    = {Kang Hyuk Lee and Judy Kay and Byeong Ho Kang and Uwe Rosebrock},
  title     = {A comparative study on statistical machine learning algorithms and thresholding strategies for automatic text categorization},
  booktitle = {Proceedings of the 7th Pacific Rim International Conference on Artificial Intelligence: Trends in Artificial Intelligence},
  series    = {Lecture Notes in Computer Science},
  volume    = {2417},
  pages     = {444--453},
  publisher = {{Springer-Verlag}},
  address   = {London, {UK}},
  year      = {2002},
  keywords  = {Apprentissage machine, Catégorisation},
  abstract  = {Two main research areas in statistical text categorization are similarity-based learning algorithms and associated thresholding strategies. The combination of these techniques significantly influences the overall performance of text categorization. After investigating two similarity-based classifiers {(k-NN} and Rocchio) and three common thresholding techniques {(RCut,} {PCut,} and {SCut),} we describe a new learning algorithm known as the keyword association network {(KAN)} and a new thresholding strategy {(RinSCut)} to improve performance over existing techniques. Extensive experiments have been conducted on the Reuters-21578 and {20-Newsgroups} data sets. The experimental results show that our new approaches give better results for both micro-averaged F 1 and macro-averaged F 1 scores.},
  annote    = {{{\textless}p{\textgreater}leeKang2002.pdf{\textless}/p{\textgreater}}}
},
-
G. Balmisse, Les agents, Km center, 2002.
@misc{balmisse_les_2002,
  author       = {Gilles Balmisse},
  title        = {Les agents},
  howpublished = {Km center},
  month        = sep,
  year         = {2002},
  url          = {http://www.gillesbalmisse.com/IMG/pdf/GB_Agent.pdf},
  keywords     = {Apprentissage machine},
  abstract     = {Il est extrêmement difficile de donner une définition du terme d'agent tant la notion même d'agent n'est pas unique et claire. Non seulement ce terme est utilisé pour désigner des applications qui sont la plupart du temps très différentes, mais aussi de nombreuses communautés venant d'horizon divers utilisent les agents pour des applications très différentes les unes des autres.},
  annote       = {{{\textless}p{\textgreater}balmisseGilles2002.pdf{\textless}/p{\textgreater}}}
},
-
A. Maedche, G. Neumann, S. Staab, and J. Kacprzyk, "Bootstrapping an ontology based information extraction system," in Intelligent exploration of the Web, Springer, 2002.
@incollection{maedche_bootstrappingontology_2002,
  author    = {A. Maedche and G. Neumann and S. Staab and J. Kacprzyk},
  title     = {Bootstrapping an ontology based information extraction system},
  booktitle = {Intelligent exploration of the Web},
  series    = {Studies in Fuzziness and Soft Computing},
  volume    = {111},
  publisher = {Springer},
  year      = {2002},
  url       = {http://citeseer.ist.psu.edu/maedche02bootstrapping.html},
  keywords  = {Apprentissage machine, Extraction d'information, Ontologie},
  abstract  = {Automatic intelligent web exploration will benefit from shallow information extraction techniques if the latter can be brought to work within many different domains. The major bottleneck for this, however, lies in the so far difficult and expensive modeling of lexical knowledge, extraction rules, and an ontology that together define the information extraction system. In this paper we present a bootstrapping approach that allows for the fast creation of an ontology-based information extracting system relying on several basic components, viz. a core information extraction system, an ontology engineering environment and an inference engine. We make extensive use of machine learning techniques to support the semi-automatic, incremental bootstrapping of the domain-specific target information extraction system.}
},
-
F. Sebastiani, "Machine learning in automated text categorization," ACM Computing Surveys, vol. 34, iss. 1, pp. 1-47, 2002.
@article{sebastiani_machine_2002-1,
  author   = {Fabrizio Sebastiani},
  title    = {Machine learning in automated text categorization},
  journal  = {{ACM} Computing Surveys},
  volume   = {34},
  number   = {1},
  pages    = {1--47},
  year     = {2002},
  issn     = {03600300},
  doi      = {10.1145/505282.505283},
  url      = {http://portal.acm.org/ft_gateway.cfm?id=505283&type=pdf&coll=GUIDE&dl=GUIDE&CFID=5037508&CFTOKEN=67853060},
  keywords = {Apprentissage machine, Catégorisation, Classification},
  abstract = {The automated categorization (or classification) of texts into predefined categories has witnessed a booming interest in the last 10 years, due to the increased availability of documents in digital form and the ensuing need to organize them. In the research community the dominant approach to this problem is based on machine learning techniques: a general inductive process automatically builds a classifier by learning, from a set of preclassified documents, the characteristics of the categories. The advantages of this approach over the knowledge engineering approach (consisting in the manual definition of a classifier by domain experts) are a very good effectiveness, considerable savings in terms of expert labor power, and straightforward portability to different domains. This survey discusses the main approaches to text categorization that fall within the machine learning paradigm. We will discuss in detail issues pertaining to three different problems, namely, document representation, classifier construction, and classifier evaluation.},
  annote   = {{{\textless}p{\textgreater}sebastianiFabrizio2002\_2.pdf{\textless}/p{\textgreater}}}
},
-
P. Brito and D. Malerba, ECML/PKDD-2002 workshop programme, 2002.
@misc{brito_ecml/pkdd-2002_2002,
  author   = {Paula Brito and Donato Malerba},
  title    = {{ECML/PKDD-2002} workshop programme},
  month    = aug,
  year     = {2002},
  keywords = {Apprentissage machine},
  annote   = {{{\textless}p{\textgreater}britoPaula2002.pdf{\textless}/p{\textgreater}}}
},
-
X. Hu and E. Atwell, A survey of machine learning approaches to analysis of large corpora, 2002.
@misc{hu_survey_2002,
  author   = {Xunlei Hu and Eric Atwell},
  title    = {A survey of machine learning approaches to analysis of large corpora},
  year     = {2002},
  url      = {http://citeseer.ist.psu.edu/565578.html},
  keywords = {Analyse de corpus, Apprentissage machine},
  abstract = {Corpus-based Machine Learning of linguistic annotations has been a key topic for all areas of Natural Language Processing. This paper presents a survey, along three dimensions of classification. First we outline different linguistic level of analysis: Tokenisation, {Part-of-Speech} tagging, Parsing, Semantic analysis and Discourse annotation. Secondly, we introduce alternative approaches to Machine Learning applicable to linguistic annotation of corpora: N-gram and Markov models, Neural ...},
  annote   = {{{\textless}p{\textgreater}huXunlei2002.pdf{\textless}/p{\textgreater}}}
},
-
A. Lavelli, B. Magnini, and F. Sebastiani, "Building thematic lexical resources by bootstrapping and machine learning," In proc. of the workshop “Linguistic knowledge acquisition and representation: bootstrapping annotated language data”, workshop at LREC-2002, 2002.
@inproceedings{lavelli_building_2002,
  author    = {Lavelli, Alberto and Magnini, Bernardo and Sebastiani, Fabrizio},
  title     = {Building thematic lexical resources by bootstrapping and machine learning},
  booktitle = {Proceedings of the workshop ``Linguistic knowledge acquisition and representation: bootstrapping annotated language data'', workshop at {LREC-2002}},
  year      = {2002},
  url       = {http://nmis.isti.cnr.it/sebastiani/Publications/LREC02.pdf},
  citeseerx = {10.1.1.58.6925},
  abstract  = {We discuss work in progress in the semi-automatic generation of thematic lexicons by means of term categorization, a novel task employing techniques from information retrieval {(IR)} and machine learning {(ML).} Specifically, we view the generation of such lexicons as an iterative process of learning previously unknown associations between terms and themes (i.e. disciplines, or fields of activity). The process is iterative, in that it generates, for each ci in a set C = {c1,...,cm} of themes, a sequence L i 0 ⊆ L i 1 ⊆... ⊆ L i n of lexicons, bootstrapping from an initial lexicon L i 0 and a set of text corpora Θ={θ0,...,θn−1} given as input. The method is inspired by text categorization, the discipline concerned with labelling natural language texts with labels from a predefined set of themes, or categories. However, while text categorization deals with documents represented as vectors in a space of terms, we formulate the task of term categorization as one in which terms are (dually) represented as vectors in a space of documents, and in which terms (instead of documents) are labelled with themes. As a learning device, we adopt boosting, since (a) it has demonstrated state-of-the-art effectiveness in a variety of text categorization applications, and (b) it naturally allows for a form of “data cleaning”, thereby making the process of generating a thematic lexicon an iteration of generate-and-test steps.},
  keywords  = {Apprentissage machine},
  annote    = {{{\textless}p{\textgreater}lavelliAlberto2002.pdf{\textless}/p{\textgreater}}}
},
-
T. Evgeniou and M. Pontil, "Support vector machines : theory and applications." Berlin; Heidelberg: Springer, 2001, pp. 249-257.
@incollection{evgeniou_support_2001,
  author     = {Evgeniou, Theodoros and Pontil, Massimiliano},
  title      = {Support vector machines : theory and applications},
  shorttitle = {Support vector machines},
  booktitle  = {Machine learning and its applications : advanced lectures},
  series     = {Lecture Notes in Computer Science},
  volume     = {2049},
  publisher  = {Springer},
  address    = {Berlin; Heidelberg},
  year       = {2001},
  pages      = {249--257},
  isbn       = {978-3-540-42490-1},
  abstract   = {This chapter presents a summary of the issues discussed during the one day workshop on {“Support} Vector Machines {(SVM)} Theory and Applications” organized as part of the Advanced Course on Artificial Intelligence {(ACAI} ’99) in Chania, Greece [19]. The goal of the chapter is twofold: to present an overview of the background theory and current understanding of {SVM,} and to discuss the papers presented as well as the issues that arose during the workshop.},
  keywords   = {Apprentissage machine},
  annote     = {{{\textless}p{\textgreater}evgeniouTheodoros2001.pdf{\textless}/p{\textgreater}}}
},
-
Y. Even-Zohar, "Multi-class classification in natural language processing," PhD Thesis, 2001.
@phdthesis{even-zuhar_multi-class_2001,
  author   = {Even-Zohar, Yair},
  title    = {Multi-class classification in natural language processing},
  school   = {University of Illinois at Urbana-Champaign},
  year     = {2001},
  pages    = {148},
  note     = {Doctorat. Dissertation Abstracts International, Section B: The Sciences and Engineering},
  abstract = {A large number of important decision problems in the natural language domain can be viewed as problems of resolving ambiguity based on properties of the surrounding environment. For example, consider a word prediction task, i.e., predicting a missing word in a sentence. This problem can be viewed as classification problems in which the goal is to select a class label from a collection of class label candidates. Additional examples of such problems include part of speech tagging, word-sense disambiguation, accent restoration, word selection in speech recognition, etc. Machine learning methods have become the most popular technique for addressing a variety of classification problems. However, in many natural language classification problems one needs to deal with two significant sources of difficulty: (i) The information, which is readily available in the sentence in the form of words, is not sufficient to resolve ambiguity by the learning algorithm. (ii) Large number of class label candidates. General purpose learning algorithms are not suited to handle multi-class classification problems well. Therefore, avoiding it by focusing on problems with a small set of candidates (typically two). This thesis addresses the aforementioned difficulties. We develop a model for multi-class classification that works by sequentially reducing the number of candidates. This model is combined with a strategy for extracting expressive knowledge from the sentence to improve the accuracy of the final classifier. Thus, we decompose the classification problem into two modules: (1) Disambiguating among a small set of class labels. (2) Reducing the number of class label candidates. Given an instance of the task and a large set of candidate class labels, reduce the number of candidates by taking a new "multiplicative-like" approach towards classification. We name this approach sequential model .
This thesis presents theoretical and empirical arguments for the advantages of using: (i) Sentence structure. (ii) The Sequential Model . Empirical arguments are given using word-prediction and part of speech tagging tasks. Theoretical arguments present this thesis as an extension of the current classification methods which aim at disambiguating among many classes.},
  keywords = {Apprentissage machine, Classification, Langage naturel},
  annote   = {{{\textless}p{\textgreater}even-zuharYair1997.pdf{\textless}/p{\textgreater}}}
},
-
M. Benkhalifa, A. Mouradi, and H. Bouyakhf, "Integrating external knowledge to supplement training data in semi-supervised learning for text categorization," Information retrieval, vol. 4, iss. 2, pp. 91-113, 2001.
@article{benkhalifa_integrating_2001,
  author   = {Benkhalifa, Mohammed and Mouradi, Abdelhak and Bouyakhf, Houssaine},
  title    = {Integrating external knowledge to supplement training data in semi-supervised learning for text categorization},
  journal  = {Information retrieval},
  volume   = {4},
  number   = {2},
  pages    = {91--113},
  year     = {2001},
  month    = jul,
  issn     = {1386-4564},
  doi      = {10.1023/A:1011458711300},
  url      = {http://www.springerlink.com/content/lx41k623j7v66962/fulltext.pdf},
  abstract = {Text Categorization {(TC)} is the automated assignment of text documents to predefined categories based on document contents. {TC} has been an application for many learning approaches, which prove effective. Nevertheless, {TC} provides many challenges to machine learning. In this paper, we suggest, for text categorization, the integration of external {WordNet} lexical information to supplement training data for a semi-supervised clustering algorithm which can learn from both training and test documents to classify new unseen documents. This algorithm is the {“Semi-Supervised} Fuzzy {c-Means”} {(ssFCM).} Our experiments use Reuters 21578 database and consist of binary classifications for categories selected from the 115 {TOPICS} classes of the Reuters collection. Using the Vector Space Model, each document is represented by its original feature vector augmented with external feature vector generated using {WordNet.} We verify experimentally that the integration of {WordNet} helps {ssFCM} improve its performance, effectively addresses the classification of documents into categories with few training documents and does not interfere with the use of training data.},
  keywords = {Apprentissage machine, Catégorisation},
  annote   = {{{\textless}p{\textgreater}benkhalifaMohammed2001.pdf{\textless}/p{\textgreater}}}
},
-
T. Hastie, R. Tibshirani, and J. H. Friedman, The elements of statistical learning : data mining, inference, and prediction, New York: Springer, 2001.
@book{hastie_elements_2001,
  author    = {Hastie, Trevor and Tibshirani, Robert and Friedman, J. H.},
  title     = {The elements of statistical learning : data mining, inference, and prediction},
  series    = {Springer series in statistics},
  publisher = {Springer},
  address   = {New York},
  year      = {2001},
  isbn      = {0387952845},
  keywords  = {Apprentissage machine, Approche statistique, Fouille de donnée}
},
-
A. Maedche and S. Staab, "Ontology learning from text." Springer, 2001, pp. 364-364.
@incollection{maedche_ontology_2001,
  author    = {Maedche, A. and Staab, S.},
  title     = {Ontology learning from text},
  booktitle = {Natural language processing and information systems : 5th international conference on applications of natural language to information systems, {NLDB} 2000, Versailles, France, june 28-30, 2000 : revised papers},
  series    = {Lecture Notes in Computer Science},
  volume    = {1959},
  publisher = {Springer},
  year      = {2001},
  pages     = {364},
  issn      = {0302-9743},
  keywords  = {Apprentissage machine, Ontologie}
},
-
T. Kohonen, Self-organizing maps, 3rd ed., Berlin: Springer, 2001.
@book{kohonen_self-organizing_2001,
  author    = {Kohonen, Teuvo},
  title     = {Self-organizing maps},
  edition   = {Third},
  publisher = {Springer},
  address   = {Berlin},
  year      = {2001},
  isbn      = {3540679219},
  abstract  = {The {Self-Organizing} Map {(SOM),} with its variants, is the most popular artificial neural network algorithm in the unsupervised learning category. Many fields of science have adopted the {SOM} as a standard analytical tool: in statistics,signal processing, control theory, financial analyses, experimental physics, chemistry and medicine. A new area is organization of very large document collections. The {SOM} is also one of the most realistic models of the biological brain functions. This new edition includes a survey of over 2000 contemporary studies to cover the newest results; the case examples were provided with detailed formulae, illustrations and tables; a new chapter on software tools for {SOM} was written, other chapters were extended or reorganized.},
  keywords  = {Apprentissage machine, Catégorisation, Réseau de neurones, Visualisation de l'information}
},
-
J. Shapiro, "Genetic algorithms in machine learning," Lecture notes in computer science, vol. 2049, pp. 146-168, 2001.
@incollection{shapiro_genetic_2001,
  author    = {Shapiro, Jonathan},
  title     = {Genetic algorithms in machine learning},
  booktitle = {Machine learning and its applications : advanced lectures},
  series    = {Lecture Notes in Computer Science},
  volume    = {2049},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2001},
  pages     = {146--168},
  issn      = {0302-9743},
  keywords  = {Apprentissage machine},
  annote    = {{{\textless}p{\textgreater}shapiroJonathan2001.pdf{\textless}/p{\textgreater}}}
},
-
M.-F. Moens and J. Dumortier, "Text categorization : the assignment of subject descriptors to magazine articles," Information Processing and Management, vol. 36, iss. 6, pp. 841-861, 2000.
@article{moens_text_2000,
  author   = {Moens, Marie-Francine and Dumortier, Jos},
  title    = {Text categorization : the assignment of subject descriptors to magazine articles},
  journal  = {Information Processing and Management},
  volume   = {36},
  number   = {6},
  pages    = {841--861},
  year     = {2000},
  issn     = {0306-4573},
  doi      = {10.1016/S0306-4573(00)00012-1},
  abstract = {Automatic text categorization is an important research area and has a potential for many text-based applications including text routing and filtering. Typical text classifiers learn from example texts that are manually categorized. When categorizing magazine articles with broad subject descriptors, we study three aspects of text classification: (1) effective selection of feature words and proper names that reflect the main topics of the text; (2) learning algorithms; and (3) improvement of the quality of the learned classifier by selection of examples. The χ2 test, which is sometimes used for selecting terms that are highly related to a text class, is applied in a novel way when constructing a category weight vector. Despite a limited number of training examples, combining an effective feature selection with the χ2 learning algorithm for training the text classifier results in an adequate categorization of new magazine articles.},
  keywords = {Apprentissage machine, Catégorisation, Indexation},
  annote   = {{{\textless}p{\textgreater}moensMarie-francine2000.pdf{\textless}/p{\textgreater}}}
},
-
M. Craven, D. DiPasquo, D. Freitag, A. McCallum, T. Mitchell, K. Nigam, and S. Slattery, "Learning to construct knowledge bases from the World Wide Web," Artificial Intelligence, vol. 118, iss. 1-2, pp. 69-113, 2000.
@article{craven_learning_2000,
  author   = {Craven, Mark and DiPasquo, Dan and Freitag, Dayne and McCallum, Andrew and Mitchell, Tom and Nigam, Kamal and Slattery, Sean},
  title    = {Learning to construct knowledge bases from the World Wide Web},
  journal  = {Artificial Intelligence},
  volume   = {118},
  number   = {1--2},
  pages    = {69--113},
  year     = {2000},
  url      = {http://www.sciencedirect.com/science/article/B6TYF-43FX0XK-3/2/8610b9d209e3e80a5ea6dfb53abdd711},
  abstract = {The World Wide Web is a vast source of information accessible to computers, but understandable only to humans. The goal of the research described here is to automatically create a computer understandable knowledge base whose content mirrors that of the World Wide Web. Such a knowledge base would enable much more effective retrieval of Web information, and promote new uses of the Web to support knowledge-based inference and problem solving. Our approach is to develop a trainable information extraction system that takes two inputs. The first is an ontology that defines the classes (e.g., company, person, employee, product) and relations (e.g., employed\_by, produced\_by) of interest when creating the knowledge base. The second is a set of training data consisting of labeled regions of hypertext that represent instances of these classes and relations. Given these inputs, the system learns to extract information from other pages and hyperlinks on the Web. This article describes our general approach, several machine learning algorithms for this task, and promising initial results with a prototype system that has created a knowledge base describing university people, courses, and research projects.},
  keywords = {Apprentissage machine, Classification, Extraction d'information, Ontologie, Web}
},
-
P. Perner and M. Petrou, Machine learning and data mining in pattern recognition : first International Workshop, MLDM’99, Leipzig, Germany, September 16-18, 1999 : proceedings, Berlin ; New York: Springer, 1999.
@book{perner_machine_1999,
  editor    = {Perner, Petra and Petrou, Maria},
  title     = {Machine learning and data mining in pattern recognition : first International Workshop, {MLDM'99,} Leipzig, Germany, September 16-18, 1999 : proceedings},
  publisher = {Springer},
  address   = {Berlin ; New York},
  year      = {1999},
  isbn      = {3540665994},
  keywords  = {Apprentissage machine, Fouille de donnée},
  annote    = {{{\textless}p{\textgreater}MLDM'99} (1st : 1999 : Leipzig, Germany) Petra Perner, Maria Petrou (eds.). ill. ; 24 cm. Learning in Pattern Recognition / Advances in Predictive Data Mining Methods / Multi-valued and Universal Binary Neurons: Learning Algorithms, Application to Image Processing and Recognition / A Dynamics of the Hough Transform and Artificial Neural Networks / Applications of Cellular Neural Networks for Shape from Shading Problem / Unsupervised Learning of Local Mean Grey Values for Image {Pre-Processing} / Neural Networks in {MR} Image Estimation from Sparsely Sampled Scans / Extraction of Local Structural Features in Images by Using a Multi-scale Relevance Function / Independent Feature Analysis for Image Retrieval / Non-hierarchical Clustering with Rival Penalized Competitive Learning for Information Retrieval / Automatic Design of Multiple Classifier Systems by Unsupervised Learning / A Comparison between Neural Networks and Decision Trees / Symbolic Learning Techniques in Paper Document Processing / Recognition of Printed Music Score / Reproductive {Process-Oriented} Data Mining From Interactions between Human and Complex {ArtifactSystem} / Generalized Fuzzy Aggregation Operators / A Data Mining Application for Monitoring Environmental Risks /{\textless}/p{\textgreater}}
},
-
G. Nault, V. Rialle, J.-G. Meunier, W. Banzhaf, J. Daida, A. E. Eiben, M. H. Garzon, V. Honavar, M. Jakiela, and R. E. Smith, "PROGEN : a genetic-based semi-automatic hypertext construction tool-first steps and experiment," Orlando, Fla., 1999.
@inproceedings{nault_progen_1999,
  author    = {Nault, Georges and Rialle, Vincent and Meunier, Jean-Guy},
  editor    = {Banzhaf, Wolfgang and Daida, Jason and Eiben, Agoston E. and Garzon, Max H. and Honavar, Vasant and Jakiela, Mark and Smith, Robert E.},
  title     = {{PROGEN} : a genetic-based semi-automatic hypertext construction tool-first steps and experiment},
  booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference ({GECCO-99})},
  publisher = {Morgan Kaufmann},
  address   = {Orlando, Fla.},
  year      = {1999},
  month     = jul,
  keywords  = {Apprentissage machine, Fouille de texte}
},
-
G. Hirst, D. St-Onge, and C. Fellbaum, "Lexical chains as representations of context for the detection and correction of malapropisms," in Wordnet: an electronic lexical database, Cambridge, Mass., 1998, pp. 305-332.
@incollection{hirst_lexical_1998,
  author    = {Hirst, Graeme and St-Onge, David},
  editor    = {Fellbaum, Christiane},
  title     = {Lexical chains as representations of context for the detection and correction of malapropisms},
  booktitle = {{WordNet}: an electronic lexical database},
  publisher = {{MIT} Press},
  address   = {Cambridge, Mass.},
  year      = {1998},
  pages     = {305--332},
  abstract  = {{[In]} this paper, we examine the idea of lexical chains as such a representation. We show how they can be constructed by means of {WordNet,} and how they can be applied in one particular linguistic task: the detection and correction of malapropisms.},
  keywords  = {Apprentissage machine}
},
-
C. Thornton, "Separability is a learner’s best friend," in Proceedings of the fourth neural computation and psychology workshop : connectionist representations, London, 1997, pp. 40-47.
@inproceedings{thornton_separability_1997,
  author    = {Thornton, Chris},
  title     = {Separability is a learner's best friend},
  booktitle = {Proceedings of the fourth neural computation and psychology workshop : connectionist representations},
  publisher = {{Springer-Verlag}},
  address   = {London},
  year      = {1997},
  pages     = {40--47},
  url       = {http://www.cogs.susx.ac.uk/users/christ/papers/best-friend.pdf},
  abstract  = {Geometric separability is a generalisation of linear separability familiar to many from Minsky and Papert's analysis of the Perceptron learning method. The concept forms a novel dimension along which to conceptualise learning methods. The present paper shows how geometric separability can be defined and demonstrates that it accurately predicts the performance of at least one empirical learning method.},
  keywords  = {Apprentissage machine},
  annote    = {{{\textless}p{\textgreater}thorntonChris1997.pdf{\textless}/p{\textgreater}}}
},
-
F. S. Osorio and B. Amy, "Apprentissage automatique constructif : un nouveau modèle neuro-symbolique = Constructive automatic learning : a new neuro-symbolic model," , Suisse, 1997.
@inproceedings{osorio_apprentissage_1997,
  author   = {Osorio, Fernando S. and Amy, Bernard},
  title    = {Apprentissage automatique constructif : un nouveau modèle neuro-symbolique = Constructive automatic learning : a new neuro-symbolic model},
  address  = {Suisse},
  year     = {1997},
  url      = {http://cat.inist.fr/?aModele=afficheN&cpsidt=2283296},
  abstract = {Depuis quelques années, les systèmes hybrides neuro-symboliques combinent avec succès réseaux de neurones artificiels et systèmes symboliques à base de connaissances, dans le but de mettre à profit de leurs points forts respectifs [4,5,9]. Nous présentons le système {INSS,} une nouvelle approche hybride basée sur les principes des réseaux {KBANN} [9]. Il représente une amélioration importante par rapport à son prédécesseur parce que l'apprentissage (très rapide) et l'extraction de connaissances sont réalisés de manière incrémental [2]. {INSS} propose une nouvelle méthodologie applicable à l'apprentissage automatique constructif avec des outils performants, même en présence de données incomplètes ou erronées.},
  keywords = {Apprentissage machine},
  annote   = {{{\textless}p{\textgreater}osorioFernando1997.pdf{\textless}/p{\textgreater}}}
},
-
M. Anthony, "Probabilistic analysis of learning in artificial neural networks: The PAC model and its variants," Neural computing surveys, vol. 1, 1997.
@article{anthony_probabilistic_1997,
  author    = {Anthony, Martin},
  title     = {Probabilistic analysis of learning in artificial neural networks: The {PAC} model and its variants},
  journal   = {Neural computing surveys},
  volume    = {1},
  year      = {1997},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.41.1975&rep=rep1&type=pdf},
  citeseerx = {10.1.1.41.1975},
  abstract  = {There are a number of mathematical approaches to the study of learning and generalization in artificial neural networks. Here we survey the `probably approximately correct ' {(PAC)} model of learning and some of its variants. These models provide a probabilistic framework for the discussion of generalization and learning. This survey concentrates on the sample complexity questions in these models; that is, the emphasis is on how many examples should be used for training. Computational complexity considerations are briefly discussed for the basic {PAC} model. Throughout, the importance of the {Vapnik-Chervonenkis} dimension is highlighted. Particular attention is devoted to describing how the probabilistic models apply in the context of neural network learning, both for networks with binary-valued output and for networks with real-valued},
  keywords  = {Apprentissage machine, Approche probabiliste, Réseau de neurones},
  annote    = {{{\textless}p{\textgreater}anthonyMartin1997.pdf{\textless}/p{\textgreater}}}
},
-
Y. Yang, An evaluation of statistical approaches to text categorization, 1997.
@techreport{yang_evaluation_1997,
  author      = {Yang, Yiming},
  title       = {An evaluation of statistical approaches to text categorization},
  institution = {School of Computer Science, Carnegie Mellon University},
  number      = {CMU-CS-97-127},
  year        = {1997},
  month       = apr,
  url         = {http://reports-archive.adm.cs.cmu.edu/anon/1997/CMU-CS-97-127.ps},
  abstract    = {This paper is a comparative study of text categorization methods. Fourteen methods are investigated, based on previously published results and newly obtained results from additional experiments. Corpus biases in commonly used document collection are examined using the performance of three classifiers. Problems in previously published experiments are analyzed, and the results of flawed experiments are excluded from the cross-method evaluation. As a result, eleven out of the fourteen methods are remained. A k-nearest neighbor {(kNN)} classifier was chosen for the performance baseline on several collections; on each collection, the performance scores of other methods were normalized using the score of {kNN.} This provides a common basis for a global observation on methods whose results are only available on individual collections. {Widrow-Hoff,} k-nearest neighbor, neural networks and the Linear Least Squares Fit mapping are the top-performing classifiers, while the Rocchio approaches had relatively poor results compared to the other learning methods. {KNN} is the only learning method that has scaled to the full domain of {MEDLINE} categories, showing a graceful behavior when the target space grows from the level of one hundred categories to a level of tens of thousands},
  keywords    = {Apprentissage machine, Catégorisation},
  annote      = {{{\textless}p{\textgreater}yangYiming1997\_1.pdf{\textless}/p{\textgreater}}}
},
-
T. M. Mitchell, Machine Learning, New York: McGraw-Hill, 1997.
@book{mitchell_machine_1997,
  author    = {Mitchell, Tom M.},
  title     = {Machine Learning},
  publisher = {{McGraw-Hill}},
  address   = {New York},
  year      = {1997},
  isbn      = {0070428077},
  keywords  = {Apprentissage machine}
},
-
K. Jim, J. Lai, and B. Wüthrich, "A data mining algorithm optimal for single rules," in Deductive and object-oriented databases : 5th international conference, DOOD’97 Montreux, Switzerland, December 8–12, 1997 : proceedings, Berlin; Heidelberg, 1997, pp. 368-385.
@inproceedings{jim_data_1997,
  author    = {Jim, K. and Lai, Jeffrey and Wüthrich, Beat},
  title     = {A data mining algorithm optimal for single rules},
  booktitle = {Deductive and object-oriented databases : 5th international conference, {DOOD'97} Montreux, Switzerland, December 8–12, 1997 : proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {1341},
  publisher = {{Springer-Verlag}},
  address   = {Berlin; Heidelberg},
  year      = {1997},
  pages     = {368--385},
  isbn      = {3-540-63792-3},
  url       = {http://portal.acm.org/citation.cfm?id=645347.650654&coll=GUIDE&dl=GUIDE},
  abstract  = {Today's rule mining algorithms all use greedy approaches to generate rules representing the knowledge hidden in vast amounts of data. When using a greedy approach, systems cannot guarantee that optimal rules are found. On the other hand, exhaustive search algorithms find optimal rules. But due to the vast search spaces, exhaustive search algorithms are in most cases impractically slow. This paper presents the A*-like rule mining algorithm {DA-2.} Similarly to exhaustive search algorithms, {DA-2} also finds optimal rules. Its running time, however, is just slightly longer than the running time of greedy algorithms.},
  keywords  = {Apprentissage machine, Fouille de donnée},
  annote    = {{{\textless}p{\textgreater}jimK1997.pdf{\textless}/p{\textgreater}}}
},
-
I. Moulinier and J. Ganascia, "Applying an existing machine learning algorithm to text categorization," Connectionist, Statistical, and Symbolic Approaches to Learning for Natural Language Processing, pp. 343-354, 1996.
@incollection{moulinier_applyingexisting_1996,
  author    = {Moulinier, Isabelle and Ganascia, Jean-Gabriel},
  title     = {Applying an existing machine learning algorithm to text categorization},
  booktitle = {Connectionist, Statistical, and Symbolic Approaches to Learning for Natural Language Processing},
  publisher = {Springer},
  year      = {1996},
  pages     = {343--354},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.51.4203},
  citeseerx = {10.1.1.51.4203},
  abstract  = {The information retrieval community is becoming increasingly interested in machine learning techniques, of which text categorization is an application. This paper describes how we have applied an existing similarity-based learning algorithm, Charade, to the text categorization problem and compares the results with those obtained using decision tree construction algorithms. From a machine learning point of view, this study was motivated by the size of the inspected data in such applications. Using the same representation of documents, Charade offers better performance than earlier reported experiments with decision trees on the same corpus. In addition, the way in which learning with redundancy influences categorization performance is also studied.},
  keywords  = {Apprentissage machine, Catégorisation},
  annote    = {{{\textless}p{\textgreater}moulinierIsabelle1996\_1.pdf{\textless}/p{\textgreater}}}
},
-
L. P. Kaelbling, M. L. Littman, and A. W. Moore, "Reinforcement learning : a survey," Journal of artificial intelligence research, vol. 4, pp. 237-285, 1996.
@article{kaelbling_reinforcement_1996,
  author     = {Kaelbling, Leslie Pack and Littman, Michael L. and Moore, Andrew W.},
  title      = {Reinforcement learning : a survey},
  shorttitle = {Reinforcement learning},
  journal    = {Journal of Artificial Intelligence Research},
  volume     = {4},
  pages      = {237--285},
  year       = {1996},
  doi        = {10.1613/jair.301},
  url        = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.44.2707&rep=rep1&type=pdf},
  citeseerx  = {10.1.1.44.2707},
  abstract   = {This paper surveys the field of reinforcement learning from a computer-science perspective. It is written to be accessible to researchers familiar with machine learning. Both the historical basis of the field and a broad selection of current work are summarized. Reinforcement learning is the problem faced by an agent that learns behavior through trial-and-error interactions with a dynamic environment. The work described here has a resemblance to work in psychology, but differs considerably in the details and in the use of the word ``reinforcement.'' The paper discusses central issues of reinforcement learning, including trading off exploration and exploitation, establishing the foundations of the field via Markov decision theory, learning from delayed reinforcement, constructing empirical models to accelerate learning, making use of generalization and hierarchy, and coping with hidden state. It concludes with a survey of some implemented systems and an assessment of the practical utility of current methods for reinforcement learning.},
  keywords   = {Apprentissage machine},
  annote     = {{{\textless}p{\textgreater}kaelblingLeslie1996.pdf{\textless}/p{\textgreater}}}
},
-
M. Kubat, I. Bratko, and R. Michalski, "A review of machine learning methods," , 1996.
@misc{kubat_review_1996,
  author    = {Kubat, Miroslav and Bratko, Ivan and Michalski, Ryszard},
  title     = {A review of machine learning methods},
  year      = {1996},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.20.4195},
  citeseerx = {10.1.1.20.4195},
  abstract  = {The field of machine learning was conceived nearly four decades ago with the bold objective to develop computational methods that would implement various forms of learning, in particular mechanisms capable of inducing knowledge from examples or},
  keywords  = {Apprentissage machine},
  annote    = {{{\textless}p{\textgreater}kubatMiroslav1996.pdf{\textless}/p{\textgreater}}}
},
-
D. Michie, D. J. Spiegelhalter, and C. C. Taylor, "Machine learning, neural and statistical classification," , 1994.
@book{michie_machine_1994,
  editor    = {Michie, D. and Spiegelhalter, D. J. and Taylor, C. C.},
  title     = {Machine learning, neural and statistical classification},
  publisher = {Ellis Horwood},
  year      = {1994},
  month     = feb,
  url       = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.27.355&rep=rep1&type=pdf},
  citeseerx = {10.1.1.27.355},
  keywords  = {Apprentissage machine, Classification},
  annote    = {{{\textless}p{\textgreater}michieD1994.pdf{\textless}/p{\textgreater}}}
},
-
J. R. Quinlan, C4.5 : programs for machine learning, San Mateo, Calif.: Morgan Kaufmann, 1993.
@book{quinlan_c4.5_1993,
  author    = {Quinlan, J. Ross},
  title     = {C4.5 : programs for machine learning},
  publisher = {Morgan Kaufmann},
  address   = {San Mateo, Calif.},
  year      = {1993},
  abstract  = {Classifier systems play a major role in machine learning and knowledge-based systems, and Ross Quinlan's work on {ID3} and C4.5 is widely acknowledged to have made some of the most significant contributions to their development. This book is a complete guide to the C4.5 system as implemented in C for the {UNIX} environment. It contains a comprehensive guide to the system's use, the source code (about 8,800 lines), and implementation notes. The source code and sample datasets are also available for download (see below). C4.5 starts with large sets of cases belonging to known classes. The cases, described by any mixture of nominal and numeric properties, are scrutinized for patterns that allow the classes to be reliably discriminated. These patterns are then expressed as models, in the form of decision trees or sets of if-then rules, that can be used to classify new cases, with emphasis on making the models understandable as well as accurate. The system has been applied successfully to tasks involving tens of thousands of cases described by hundreds of properties. The book starts from simple core learning methods and shows how they can be elaborated and extended to deal with typical problems such as missing data and overfitting. Advantages and disadvantages of the C4.5 approach are discussed and illustrated with several case studies. This book and software should be of interest to developers of classification-based intelligent systems and to students in machine learning and expert systems courses.},
  keywords  = {Apprentissage machine}
},
-
M. A. Cohen and S. Grossberg, "Masking fields : a massively parallel neural architecture for learning, recognizing, and predicting multiple grouping of patterned data." Cambridge, Mass.: MIT Press, 1988, pp. 317-367.
@incollection{cohen_masking_1988,
  author    = {Cohen, Michael A. and Grossberg, Stephen},
  title     = {Masking fields : a massively parallel neural architecture for learning, recognizing, and predicting multiple grouping of patterned data},
  booktitle = {Neural networks and natural intelligence},
  publisher = {{MIT} Press},
  address   = {Cambridge, Mass.},
  year      = {1988},
  pages     = {317--367},
  isbn      = {{026207107X}},
  keywords  = {Apprentissage machine, Cluster}
},
-
Y. Kodratoff, Machine learning and data mining.
@misc{kodratoff_machine_????,
  author   = {Kodratoff, Yves},
  title    = {Machine learning and data mining},
  abstract = {Deep differences explain why Data Mining has been enthusiastically accepted by Industry, while Machine Learning and Exploratory Statistics still have problems being accepted by it. This paper points at all the epistemological, scientific, and industrial differences between the two, and explains why Data Mining is better accepted in Industry.},
  keywords = {Apprentissage machine, Fouille de donnée},
  annote   = {{{\textless}p{\textgreater}kodratoffYves.doc{\textless}/p{\textgreater}}}
},
-
M. El-Bèze, J. M. Torres-Moreno, and F. Béchet, "Un duel probabiliste pour départager deux présidents," Revue des nouvelles technologies de l’information, iss. E-10, 2007.
@article{el-bze_un_2007,
  author   = {{El-Bèze}, Marc and {Torres-Moreno}, Juan Manuel and Béchet, Frédéric},
  title    = {Un duel probabiliste pour départager deux présidents},
  journal  = {Revue des nouvelles technologies de l'information},
  number   = {E-10},
  year     = {2007},
  url      = {http://lia.univ-avignon.fr/fich_art/895-RNTI_elbeze_torres_bechet.pdf},
  keywords = {Approche probabiliste},
  annote   = {{{\textless}p{\textgreater}el-bezeMarc2007.pdf{\textless}/p{\textgreater}}}
},
-
D. Newman, C. Chemudugunta, P. Smyth, and M. Steyvers, "Analyzing entities and topics in news articles using statistical topic models," in Intelligence and security informatics : IEEE international conference on intelligence and security informatics, ISI 2006, San Diego, CA, USA, may 23-24, 2006 : proceedings, Berlin; New York, 2006, pp. 93-104.
@inproceedings{newman_analyzing_2006,
  author    = {Newman, David and Chemudugunta, Chaitanya and Smyth, Padhraic and Steyvers, Mark},
  title     = {Analyzing entities and topics in news articles using statistical topic models},
  booktitle = {Intelligence and security informatics : {IEEE} international conference on intelligence and security informatics, {ISI} 2006, San Diego, {CA}, {USA}, may 23-24, 2006 : proceedings},
  series    = {Lecture notes in computer science},
  volume    = {3975},
  publisher = {Springer},
  address   = {Berlin; New York},
  year      = {2006},
  pages     = {93--104},
  isbn      = {3540344780},
  doi       = {10.1007/11760146_9},
  abstract  = {Statistical language models can learn relationships between topics discussed in a document collection and persons, organizations and places mentioned in each document. We present a novel combination of statistical topic models and named-entity recognizers to jointly analyze entities mentioned (persons, organizations and places) and topics discussed in a collection of 330,000 New York Times news articles. We demonstrate an analytic framework which automatically extracts from a large collection: topics; topic trends; and topics that relate entities.},
  keywords  = {Approche probabiliste, Approche statistique, Intelligence artificielle},
  annote    = {{{\textless}p{\textgreater}newmanDavid2006.pdf{\textless}/p{\textgreater}}}
},
-
A. Culotta, A. McCallum, and J. Betz, "Integrating probabilistic extraction models and data mining to discover relations and patterns in text," New York, New York, 2006, pp. 296-303.
@inproceedings{culotta_integrating_2006,
  author    = {Culotta, Aron and {McCallum}, Andrew and Betz, Jonathan},
  title     = {Integrating probabilistic extraction models and data mining to discover relations and patterns in text},
  booktitle = {Proceedings of the Human Language Technology Conference of the {NAACL}, Main Conference},
  publisher = {Association for Computational Linguistics},
  address   = {New York, New York},
  year      = {2006},
  pages     = {296--303},
  doi       = {10.3115/1220835.1220873},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1220873&type=pdf&coll=ACM&dl=ACM},
  abstract  = {In order for relation extraction systems to obtain human-level performance, they must be able to incorporate relational patterns inherent in the data (for example, that one's sister is likely one's mother's daughter, or that children are likely to attend the same college as their parents). Hand-coding such knowledge can be time-consuming and inadequate. Additionally, there may exist many interesting, unknown relational patterns that both improve extraction performance and provide insight into text. We describe a probabilistic extraction model that provides mutual benefits to both "top-down" relational pattern discovery and "bottom-up" relation extraction.},
  keywords  = {Approche probabiliste, Fouille de donnée, Fouille de texte},
  annote    = {{{\textless}p{\textgreater}culottaAron2006.pdf{\textless}/p{\textgreater}}}
},
-
D. Downey, O. Etzioni, and S. Soderland, "A probabilistic model of redundancy in information extraction," Procs. of IJCAI, vol. 2005, 2005.
@inproceedings{downey_probabilistic_2005,
  author    = {Downey, D. and Etzioni, O. and Soderland, S.},
  title     = {A probabilistic model of redundancy in information extraction},
  booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence ({IJCAI})},
  year      = {2005},
  keywords  = {Approche probabiliste, Extraction d'information},
  annote    = {{{\textless}p{\textgreater}downeyDoug2005.pdf{\textless}/p{\textgreater}}}
},
-
T. L. Griffiths and M. Steyvers, "Finding scientific topics," Proceedings of the National Academy of Sciences of the United States of America, vol. 101 Suppl. 1, pp. 5228-35, 2004.
@article{griffiths_finding_2004,
  author   = {Griffiths, Thomas L. and Steyvers, Mark},
  title    = {Finding scientific topics},
  journal  = {Proceedings of the National Academy of Sciences of the United States of America},
  volume   = {101},
  number   = {Suppl. 1},
  pages    = {5228--5235},
  year     = {2004},
  issn     = {0027-8424},
  doi      = {10.1073/pnas.0307752101},
  pmid     = {14872004},
  url      = {http://www.pnas.org/content/101/suppl.1/5228.full.pdf+html},
  abstract = {A first step in identifying the content of a document is determining which topics that document addresses. We describe a generative model for documents, introduced by Blei, Ng, and Jordan [Blei, D. M., Ng, A. Y. \& Jordan, M. I. (2003) J. Machine Learn. Res. 3, 993-1022], in which each document is generated by choosing a distribution over topics and then choosing each word in the document from a topic selected according to this distribution. We then present a Markov chain Monte Carlo algorithm for inference in this model. We use this algorithm to analyze abstracts from {PNAS} by using Bayesian model selection to establish the number of topics. We show that the extracted topics capture meaningful structure in the data, consistent with the class designations provided by the authors of the articles, and outline further applications of this analysis, including identifying "hot topics" by examining temporal dynamics and tagging abstracts to illustrate semantic content.},
  keywords = {Approche probabiliste},
  annote   = {{{\textless}p{\textgreater}griffithsThomas2004.pdf{\textless}/p{\textgreater}}}
},
-
A. Moschitti and R. Basili, "Complex linguistic features for text classification : a comprehensive study," in Advances in information retrieval : 26th European Conference on IR Research, ECIR 2004, Sunderland, UK, April 5-7, 2004 : proceedings, Berlin ; New York, NY, 2004, pp. 181-196.
@inproceedings{moschitti_complex_2004,
  author        = {Moschitti, Alessandro and Basili, Roberto},
  title         = {Complex linguistic features for text classification : a comprehensive study},
  booktitle     = {Advances in information retrieval : 26th European Conference on {IR} Research, {ECIR} 2004, Sunderland, {UK}, April 5-7, 2004 : proceedings},
  series        = {Lecture notes in computer science},
  volume        = {2997},
  publisher     = {Springer},
  address       = {Berlin ; New York, {NY}},
  year          = {2004},
  pages         = {181--196},
  isbn          = {3540213821},
  internal-note = {isbn replaced: the exported value 1558602070 carries a Morgan Kaufmann publisher prefix, not Springer's -- verify against the printed volume},
  url           = {http://dit.unitn.it/~moschitt/articles/ECIR2004.pdf},
  abstract      = {Previous researches on advanced representations for document retrieval have shown that statistical state-of-the-art models are not improved by a variety of different linguistic representations. Phrases, word senses and syntactic relations derived by Natural Language Processing {(NLP)} techniques were observed ineffective to increase retrieval accuracy. For Text Categorization {(TC)} are available fewer and less definitive studies on the use of advanced document representations as it is a relatively new research area (compared to document retrieval). In this paper, advanced document representations have been investigated. Extensive experimentation on representative classifiers, Rocchio and {SVM,} as well as a careful analysis of the literature have been carried out to study how some {NLP} techniques used for indexing impact {TC.} Cross validation over 4 different corpora in two languages allowed us to gather an overwhelming evidence that complex nominals, proper nouns and word senses are not adequate to improve {TC} accuracy.},
  keywords      = {Analyse de contenu, Approche probabiliste, Catégorisation, Classification, Indexation, Langage naturel},
  annote        = {{{\textless}p{\textgreater}moschittiAlessandro2004.pdf{\textless}/p{\textgreater}}}
},
-
P. B. Dobrokhotov, C. Goutte, and E. Gaussier, "Combining NLP and probabilistic categorisation for document and term selection for Swiss-Prot medical annotation," Bioinformatics, vol. 19, iss. Supplement 1, pp. 91-94, 2003.
@article{dobrokhotov_combining_2003,
  author   = {Dobrokhotov, P. B. and Goutte, C. and Gaussier, E.},
  title    = {Combining {NLP} and probabilistic categorisation for document and term selection for {Swiss-Prot} medical annotation},
  journal  = {Bioinformatics},
  volume   = {19},
  number   = {Supplement 1},
  pages    = {91--94},
  year     = {2003},
  url      = {http://www.ingentaconnect.com/content/oup/cabios/2003/00000019/A00100s1/art00091},
  abstract = {Motivation: Searching relevant publications for manual database annotation is a tedious task. In this paper, we apply a combination of Natural Language Processing {(NLP)} and probabilistic classification to re-rank documents returned by {PubMed} according to their relevance to {Swiss-Prot} annotation, and to identify significant terms in the documents. Results: With a Probabilistic Latent Categoriser {(PLC)} we obtained 69\% recall and 59\% precision for relevant documents in a representative query. As the {PLC} technique provides the relative contribution of each term to the final document score, we used the {Kullback-Leibler} symmetric divergence to determine the most discriminating words for {Swiss-Prot} medical annotation. This information should allow curators to understand classification results better. It also has great value for fine-tuning the linguistic pre-processing of documents, which in turn can improve the overall classifier performance. Availability: The medical annotation dataset is available from the authors upon request Contact: {Pavel.Dobrokhotov@isb-sib.ch;} {Cyril.Goutte@xrce.xerox.com}},
  keywords = {Approche probabiliste, Catégorisation}
},
-
K. Sparck Jones, S. E. Robertson, and S. Walker, "A probabilistic model of information retrieval : development and comparative experiments, parts 1 and 2," Information Processing and Management, vol. 36, iss. 6, pp. 779-808, 2000.
@article{sparck_jones_probabilistic_2000,
  author   = {Sparck Jones, K. and Robertson, S. E. and Walker, S.},
  title    = {A probabilistic model of information retrieval : development and comparative experiments, parts 1 and 2},
  journal  = {Information Processing and Management},
  volume   = {36},
  number   = {6},
  pages    = {779--808},
  year     = {2000},
  keywords = {Approche probabiliste, Recherche d'information},
  annote   = {{{\textless}p{\textgreater}sparck\_jonesK2000\_partie1.pdf{\textless}/p{\textgreater}{\textless}p{\textgreater}sparck\_jonesK2000\_partie2.pdf{\textless}/p{\textgreater}}}
},
-
D. Hiemstra, "A probabilistic justification for using tf×idf term weighting in information retrieval," International Journal on Digital Libraries, vol. 3, iss. 2, pp. 131-139, 2000.
@article{hiemstra_probabilistic_2000,
  author   = {Hiemstra, Djoerd},
  title    = {A probabilistic justification for using tf×idf term weighting in information retrieval},
  journal  = {International Journal on Digital Libraries},
  volume   = {3},
  number   = {2},
  pages    = {131--139},
  year     = {2000},
  issn     = {14325012 {(Print)} 14321300 {(Online)}},
  doi      = {10.1007/s007999900025},
  abstract = {This paper presents a new probabilistic model of information retrieval. The most important modeling assumption made is that documents and queries are defined by an ordered sequence of single terms. This assumption is not made in well-known existing models of information retrieval, but is essential in the field of statistical natural language processing. Advances already made in statistical natural language processing will be used in this paper to formulate a probabilistic justification for using tf×idf term weighting. The paper shows that the new probabilistic interpretation of tf×idf term weighting might lead to better understanding of statistical ranking mechanisms, for example by explaining how they relate to coordination level ranking. A pilot experiment on the {TREC} collection shows that the linguistically motivated weighting algorithm outperforms the popular {BM25} weighting algorithm.},
  keywords = {Approche probabiliste, Recherche d'information},
  annote   = {{{\textless}p{\textgreater}hiemstraDjoerd2000.pdf{\textless}/p{\textgreater}}}
},
-
J. M. Wiebe, R. F. Bruce, N. Nicolov, and R. Mitkov, "Probabilistic event categorisation." Amsterdam, Netherlands: Benjamins, 2000, pp. 341-352.
@incollection{wiebe_probabilistic_2000,
  author    = {Wiebe, Janyce M. and Bruce, Rebecca F.},
  title     = {Probabilistic event categorisation},
  editor    = {Nicolov, Nicolas and Mitkov, Ruslan},
  booktitle = {Recent Advances in Natural Language Processing, {II}},
  publisher = {Benjamins},
  address   = {Amsterdam, Netherlands},
  year      = {2000},
  pages     = {341--352},
  keywords  = {Approche probabiliste, Classification, Linguistique}
},
-
T. Joachims, "A probabilistic analysis of the rocchio algorithm with TFIDF for text categorization." 1997, pp. 143-151.
@inproceedings{joachims_probabilistic_1997,
  author    = {Joachims, Thorsten},
  title     = {A probabilistic analysis of the rocchio algorithm with {TFIDF} for text categorization},
  booktitle = {Proceedings of the Fourteenth International Conference on Machine Learning ({ICML} '97)},
  year      = {1997},
  pages     = {143--151},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.7950},
  citeseerx = {10.1.1.21.7950},
  abstract  = {The Rocchio relevance feedback algorithm is one of the most popular and widely applied learning methods from information retrieval. Here, a probabilistic analysis of this algorithm is presented in a text categorization framework. The analysis gives theoretical insight into the heuristics used in the Rocchio algorithm, particularly the word weighting scheme and the similarity metric. It also suggests improvements which lead to a probabilistic variant of the Rocchio classifier. The Rocchio classifier, its probabilistic variant, and a naive Bayes classifier are compared on six text categorization tasks. The results show that the probabilistic algorithms are preferable to the heuristic Rocchio classifier not only because they are more well-founded, but also because they achieve better performance.},
  keywords  = {Approche probabiliste, Catégorisation},
  annote    = {{{\textless}p{\textgreater}joachimsThorston1997.pdf{\textless}/p{\textgreater}}}
},
-
M. Anthony, "Probabilistic analysis of learning in artificial neural networks: The PAC model and its variants," Neural computing surveys, vol. 1, 1997.
@article{anthony_probabilistic_1997,
  author    = {Anthony, Martin},
  title     = {Probabilistic analysis of learning in artificial neural networks: The {PAC} model and its variants},
  journal   = {Neural computing surveys},
  volume    = {1},
  year      = {1997},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.41.1975&rep=rep1&type=pdf},
  citeseerx = {10.1.1.41.1975},
  abstract  = {There are a number of mathematical approaches to the study of learning and generalization in artificial neural networks. Here we survey the ``probably approximately correct'' {(PAC)} model of learning and some of its variants. These models provide a probabilistic framework for the discussion of generalization and learning. This survey concentrates on the sample complexity questions in these models; that is, the emphasis is on how many examples should be used for training. Computational complexity considerations are briefly discussed for the basic {PAC} model. Throughout, the importance of the {Vapnik-Chervonenkis} dimension is highlighted. Particular attention is devoted to describing how the probabilistic models apply in the context of neural network learning, both for networks with binary-valued output and for networks with real-valued},
  keywords  = {Apprentissage machine, Approche probabiliste, Réseau de neurones},
  annote    = {{{\textless}p{\textgreater}anthonyMartin1997.pdf{\textless}/p{\textgreater}}}
},
-
T. Joachims, A probabilistic analysis of the rocchio algorithm with tfxidf for text categorization, 1996.
@misc{thorston_probabilistic_1996,
  author   = {Joachims, Thorsten},
  title    = {A probabilistic analysis of the rocchio algorithm with tfxidf for text categorization},
  year     = {1996},
  keywords = {Approche probabiliste, Catégorisation},
  annote   = {{{\textless}p{\textgreater}joachimsThorston1996.pdf{\textless}/p{\textgreater}}}
},
-
R. M. Neal, "Probabilistic inference using Markov chain Monte Carlo methods," 1993.
@techreport{neal_probabilistic_1993,
  author      = {Neal, Radford M.},
  title       = {Probabilistic inference using Markov chain Monte Carlo methods},
  institution = {Department of Computer Science, University of Toronto},
  number      = {CRG-TR-93-1},
  year        = {1993},
  url         = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.46.8183},
  citeseerx   = {10.1.1.46.8183},
  abstract    = {Probabilistic inference is an attractive approach to uncertain reasoning and empirical learning in artificial intelligence. Computational difficulties arise, however, because probabilistic models with the necessary realism and flexibility lead to complex distributions over high-dimensional spaces. Related problems in other fields have been tackled using Monte Carlo methods based on sampling using Markov chains, providing a rich array of techniques that can be applied to problems in artificial intelligence. The ``Metropolis algorithm'' has been used to solve difficult problems in statistical physics for over forty years, and, in the last few years, the related method of ``Gibbs sampling'' has been applied to problems of statistical inference. Concurrently, an alternative method for solving problems in statistical physics by means of dynamical simulation has been developed as well, and has recently been unified with the Metropolis algorithm to produce the ``hybrid Monte Carlo'' method. In computer science, Markov chain sampling is the basis of the heuristic optimization technique of ``simulated annealing'', and has recently been used in randomized algorithms for approximate counting of large sets. In this review, I outline the role of probabilistic inference in artificial intelligence, present the theory of Markov chains, and describe various Markov chain Monte Carlo algorithms, along with a number of supporting techniques. I try to present a comprehensive picture of the range of methods that have been developed, including techniques from the varied literature that have not yet seen wide application in artificial intelligence, but which appear relevant. As illustrative examples, I use the problems of probabilistic inference in expert systems, discovery of latent classes from data, and Bayesian learning for neural networks.},
  keywords    = {Approche probabiliste, Méthodologie},
  annote      = {{{\textless}p{\textgreater}nealRadford1993.pdf{\textless}/p{\textgreater}}}
},
-
F. Rosenblatt, "A probabilistic model for information storage and organization in the brain," Psychological Review, vol. 65, pp. 386-408, 1958.
@article{rosenblatt_probabilistic_1958,
  author   = {Rosenblatt, F.},
  title    = {A probabilistic model for information storage and organization in the brain},
  journal  = {Psychological Review},
  volume   = {65},
  pages    = {386--408},
  year     = {1958},
  keywords = {Approche probabiliste}
},
-
M. El-Bèze, J.-M. Torres-Moreno, and F. Béchet, Un duel probabiliste pour départager deux présidents.
@misc{el-bze_un_????,
  author   = {{El-Bèze}, Marc and {Torres-Moreno}, Juan-Manuel and Béchet, Frédéric},
  title    = {Un duel probabiliste pour départager deux présidents},
  abstract = {Nous présentons une palette de modèles probabilistes appliqués à la tâche de classification binaire telle que définie dans le cadre du défi {TALN/RECITAL} {DEFT’05.} Au sein de discours de Jacques Chirac, a pu être insérée une séquence de phrases de François Mitterrand. Pour identifier la paternité de ces séquences, nous avions utilisé des chaînes de Markov, des modèles bayésiens, et des procédures d’adaptation de ces modèles. Depuis, pour modéliser la cohérence interne des discours nous avons développé une méthode probabiliste qui améliore les performances. Une comparaison avec diverses approches montre la supériorité d’une stratégie combinant apprentissage, cohérence et adaptation. Les résultats que nous obtenons, en termes de précision (0,890), rappel (0,955) et Fscore (0,925) sur le sous-corpus de test Mitterrand sont très encourageants.},
  keywords = {Approche probabiliste},
  annote   = {{{\textless}p{\textgreater}el-bezeMarc.pdf{\textless}/p{\textgreater}}}
},
-
Y. Li, C. Luo, and S. M. Chung, "Text clustering with feature selection by using statistical data," Knowledge and Data Engineering, IEEE Transactions on, vol. 20, iss. 5, pp. 641-652, 2008.
@article{yanjun_li_text_2008,
  author   = {Li, Yanjun and Luo, Congnan and Chung, S. M.},
  title    = {Text clustering with feature selection by using statistical data},
  journal  = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume   = {20},
  number   = {5},
  pages    = {641--652},
  year     = {2008},
  issn     = {1041-4347},
  doi      = {10.1109/TKDE.2004.1269663},
  url      = {http://ieeexplore.ieee.org/Xplore/login.jsp?url=/iel5/69/28407/01269663.pdf?tp=&isnumber=28407&arnumber=1269663&punumber=%3Cb%3E%3Cfont%20color=990000%3E69%3C/font%3E%3C/b%3E},
  abstract = {Feature selection is an important method for improving the efficiency and accuracy of text categorization algorithms by removing redundant and irrelevant terms from the corpus. In this paper, we propose a new supervised feature selection method, named {CHIR,} which is based on the chi2 statistic and new statistical data that can measure the positive term-category dependency. We also propose a new text clustering algorithm, named text clustering with feature selection {(TCFS).} {TCFS} can incorporate {CHIR} to identify relevant features (i.e., terms) iteratively, and the clustering becomes a learning process. We compared {TCFS} and the K-means clustering algorithm in combination with different feature selection methods for various real data sets. Our experimental results show that {TCFS} with {CHIR} has better clustering accuracy in terms of the F-measure and the purity.},
  keywords = {Approche statistique, Cluster, Fouille de texte},
  annote   = {{{\textless}p{\textgreater}liYanjun2008.pdf{\textless}/p{\textgreater}}}
},
-
L. Zhou, Y. Shi, and D. Zhang, "A statistical language modeling approach to online deception detection," Knowledge and Data Engineering, IEEE Transactions on, vol. 20, iss. 8, pp. 1077-1081, 2008.
@article{zhou_statistical_2008,
  author   = {Zhou, Lina and Shi, Yongmei and Zhang, Dongsong},
  title    = {A statistical language modeling approach to online deception detection},
  journal  = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume   = {20},
  number   = {8},
  pages    = {1077--1081},
  year     = {2008},
  issn     = {1041-4347},
  doi      = {10.1109/TKDE.2007.190624},
  url      = {http://ieeexplore.ieee.org/Xplore/login.jsp?url=/iel5/69/4553782/04358936.pdf?tp=&isnumber=4553782&arnumber=4358936&punumber=%3Cb%3E%3Cfont%20color=990000%3E69%3C/font%3E%3C/b%3E},
  abstract = {Online deception is disrupting our daily life, organizational process, and even national security. Existing approaches to online deception detection follow a traditional paradigm by using a set of cues as antecedents for deception detection, which may be hindered by ineffective cue identification. Motivated by the strength of statistical language models {(SLMs)} in capturing the dependency of words in text without explicit feature extraction, we developed {SLMs} to detect online deception. We also addressed the data sparsity problem in building {SLMs} in general and in deception detection in specific using smoothing and vocabulary pruning techniques. The developed {SLMs} were evaluated empirically with diverse datasets. The results showed that the proposed {SLM} approach to deception detection outperformed a state-of-the-art text categorization method as well as traditional feature-based methods.},
  keywords = {Apprentissage machine, Approche statistique},
  annote   = {{{\textless}p{\textgreater}zhouLina2008.pdf{\textless}/p{\textgreater}}}
},
-
J. F. Hair Jr., "Knowledge creation in marketing: the role of predictive analytics," European Business Review, vol. 19, iss. 4, pp. 303-315, 2007.
@article{joe_f._hair_jr_knowledge_2007,
  author   = {Hair, Jr., Joe F.},
  title    = {Knowledge creation in marketing: the role of predictive analytics},
  journal  = {European Business Review},
  volume   = {19},
  number   = {4},
  pages    = {303--315},
  year     = {2007},
  issn     = {{0955-534X}},
  doi      = {10.1108/09555340710760134},
  url      = {http://www.emeraldinsight.com/10.1108/09555340710760134},
  abstract = {Purpose – The purpose of this paper is to provide an overview of predictive analytics, summarize how it is impacting knowledge creation in marketing, and suggest future developments in marketing and predictive analytics for both organizations and researchers. Design/methodology/approach – Survival in a knowledge-based economy is derived from the ability to convert information to knowledge. To do so, researchers and managers increasingly are relying on the field of predictive analytics. Data mining identifies and confirms relationships between explanatory and criterion variables. Predictive analytics uses confirmed relationships between variables to predict future outcomes. The predictions are most often values suggesting the likelihood a particular behavior or event will take place in the future. Findings – Data mining and predictive analytics are increasingly popular because of the substantial contributions they can make in converting information to knowledge. Marketing is among the most frequent applications of the techniques, and whether you think about product development, advertising, distribution and retailing, or marketing research and business intelligence, data mining and predictive analytics increasingly are being applied. Originality/value – In the future, we can expect predictive analytics to increasingly be applied to databases in all fields and revolutionize the ability to identify, understand and predict future developments, data analysts will increasingly rely on mixed-data models that examine both structured (numbers) and unstructured (text and images) data, statistical tools will be more powerful and easier to use, future applications will be global and real time, demand for data analysts will increase as will the need for students to learn data analysis methods, and scholarly researchers will need to improve their quantitative skills so the large amounts of information available can be used to create knowledge instead of information overload.},
  keywords = {Approche statistique, Extraction d'information}
},
-
I. Ayres, Super crunchers: why thinking-by-numbers is the new way to be smart, New York: Bantam Books, 2007.
@book{ayres_super_2007,
  author    = {Ayres, Ian},
  title     = {Super crunchers: why thinking-by-numbers is the new way to be smart},
  publisher = {Bantam Books},
  address   = {New York},
  year      = {2007},
  isbn      = {9780553805406},
  keywords  = {Approche statistique}
},
-
D. Newman, C. Chemudugunta, P. Smyth, and M. Steyvers, "Analyzing entities and topics in news articles using statistical topic models," in Intelligence and security informatics : IEEE international conference on intelligence and security informatics, ISI 2006, San Diego, CA, USA, may 23-24, 2006 : proceedings, Berlin; New York, 2006, pp. 93-104.
@inproceedings{newman_analyzing_2006,
  author    = {Newman, David and Chemudugunta, Chaitanya and Smyth, Padhraic and Steyvers, Mark},
  title     = {Analyzing entities and topics in news articles using statistical topic models},
  booktitle = {Intelligence and security informatics : {IEEE} international conference on intelligence and security informatics, {ISI} 2006, San Diego, {CA}, {USA}, may 23-24, 2006 : proceedings},
  series    = {Lecture notes in computer science},
  volume    = {3975},
  publisher = {Springer},
  address   = {Berlin; New York},
  year      = {2006},
  pages     = {93--104},
  isbn      = {3540344780},
  doi       = {10.1007/11760146_9},
  abstract  = {Statistical language models can learn relationships between topics discussed in a document collection and persons, organizations and places mentioned in each document. We present a novel combination of statistical topic models and named-entity recognizers to jointly analyze entities mentioned (persons, organizations and places) and topics discussed in a collection of 330,000 New York Times news articles. We demonstrate an analytic framework which automatically extracts from a large collection: topics; topic trends; and topics that relate entities.},
  keywords  = {Approche probabiliste, Approche statistique, Intelligence artificielle},
  annote    = {{{\textless}p{\textgreater}newmanDavid2006.pdf{\textless}/p{\textgreater}}}
},
-
X. Jiang and A.-H. Tan, "Mining ontological knowledge from domain-specific text documents," Houston, TX, USA, 2006, p. 4.
@inproceedings{xing_mining_2006,
  author    = {Jiang, Xing and Tan, Ah-Hwee},
  title     = {Mining ontological knowledge from domain-specific text documents},
  booktitle = {Proceedings. Fifth {IEEE} International Conference on Data Mining},
  publisher = {{IEEE} Computer Society},
  address   = {Houston, {TX}, {USA}},
  year      = {2006},
  pages     = {4 pp.},
  note      = {Copyright 2006, The Institution of Engineering and Technology},
  keywords  = {Analyse de texte, Approche statistique, Fouille de donnée, Ontologie},
  abstract  = {Traditional text mining systems employ shallow parsing techniques and focus on concept extraction and taxonomic relation extraction. This paper presents a novel system called {CRCTOL} for mining rich semantic knowledge in the form of ontology from domain-specific text documents. By using a full text parsing technique and incorporating both statistical and lexico-syntactic methods, the knowledge extracted by our system is more concise and contains a richer semantics compared with alternative systems. We conduct a case study wherein {CRCTOL} extracts ontological knowledge, specifically key concepts and semantic relations, from a terrorism domain text collection. Quantitative evaluation, by comparing with a state-of-the-art ontology learning system known as text-to-onto, has shown that {CRCTOL} produces much better precision and recall for both concept and relation extraction, especially from sentences with complex structures},
  annote    = {{\textless}p{\textgreater}8857416 ontological knowledge mining domain-specific text document text mining full text parsing statistical method lexico-syntactic method concept extraction relation extraction concept relation concept tuple ontology learning{\textless}/p{\textgreater}}
},
-
C. Cortes, "Moment kernels for regular distributions," Machine Learning, vol. 60, iss. 1-3, pp. 117-134, 2005.
@article{cortes_moment_2005,
  author   = {Corinna Cortes},
  title    = {Moment kernels for regular distributions},
  journal  = {Machine Learning},
  volume   = {60},
  number   = {1-3},
  pages    = {117--134},
  year     = {2005},
  url      = {http://www.ingentaconnect.com/content/klu/ml/2005/00000060/F0030001/00000919},
  keywords = {Approche statistique},
  abstract = {Many machine learning problems in natural language processing, transaction-log analysis, or computational biology, require the analysis of variable-length sequences, or, more generally, distributions of variable-length sequences. Kernel methods introduced for fixed-size vectors have proven very successful in a variety of machine learning tasks. We recently introduced a new and general kernel framework, rational kernels, to extend these methods to the analysis of variable-length sequences or more generally distributions given by weighted automata. These kernels are efficient to compute and have been successfully used in applications such as spoken-dialog classification with Support Vector Machines. However, the rational kernels previously introduced in these applications do not fully encompass distributions over alternate sequences. They are based only on the counts of co-occurring subsequences averaged over the alternate paths without taking into accounts information about the higher-order moments of the distributions of these counts. In this paper, we introduce a new family of rational kernels, moment kernels, that precisely exploits this additional information. These kernels are distribution kernels based on moments of counts of strings. We describe efficient algorithms to compute moment kernels and apply them to several difficult spoken-dialog classification tasks. Our experiments show that using the second moment of the counts of n-gram sequences consistently improves the classification accuracy in these tasks.},
  annote   = {{\textless}p{\textgreater}new family of rational kernels = moment kernels{\textless}/p{\textgreater}}
},
-
M. Rossignol and P. Sébillot, "Combining statistical data analysis techniques to extract topical keyword classes from corpora," Intelligent Data Analysis, vol. 9, iss. 1, pp. 105-127, 2005.
@article{rossignol_combining_2005,
  author   = {Rossignol, Mathias and Sébillot, Pascale},
  title    = {Combining statistical data analysis techniques to extract topical keyword classes from corpora},
  journal  = {Intelligent Data Analysis},
  volume   = {9},
  number   = {1},
  pages    = {105--127},
  year     = {2005},
  issn     = {1088-467X},
  keywords = {Approche statistique, Classification},
  abstract = {We present an unsupervised method for the generation from a textual corpus of sets of keywords, that is, words whose occurrences in a text are strongly connected with the presence of a given topic. Each of these classes is associated with one of the main topics of the corpus, and can be used to detect the presence of that topic in any of its paragraphs, by a simple keyword co-occurrence criterion. The classes are extracted from the textual data in a fully automatic way, without requiring any a priori linguistic knowledge or making any assumptions about the topics to search for. The algorithms we have developed allow us to yield satisfactory and directly usable results despite the amount of noise inherent in textual data. That goal is reached thanks to a combination of several data analysis techniques. On a corpus of archives from the French monthly newspaper Le Monde Diplomatique, we obtain 40 classes of about 30 words each that accurately characterize precise topics, and allow us to detect their occurrences with a precision and recall of 85\% and 65\% respectively.},
  annote   = {{{\textless}p{\textgreater}rossignolMathias2005\_1.pdf{\textless}/p{\textgreater}}}
},
-
K. Spärck Jones, "Some points in a time," Computational Linguistics, vol. 31, iss. 1, pp. 1-14, 2005.
@article{sprck_jones_points_2005,
  author   = {{Spärck Jones}, Karen},
  title    = {Some points in a time},
  journal  = {Computational Linguistics},
  volume   = {31},
  number   = {1},
  pages    = {1--14},
  year     = {2005},
  doi      = {10.1162/0891201053630237},
  url      = {http://www.mitpressjournals.org/doi/pdf/10.1162/0891201053630237},
  keywords = {Approche statistique, Méthodologie},
  abstract = {This article offers a personal perspective on the development of language and information processing over the last half century, focusing on the use of statistical methods. Introduced, with computers, in the 1950s, these have not always been highly regarded, but were revived in the 1990s. They have proved effective in more ways than might have been expected, and encourage new thinking about what language and information processing involve.},
  annote   = {{{\textless}p{\textgreater}sparckKaren2005.pdf{\textless}/p{\textgreater}}}
}
-
S. Y. Jung, J.-H. Hong, and T.-S. Kim, "A statistical model for user preference," Knowledge and Data Engineering, IEEE Transactions on, vol. 17, iss. 6, pp. 834-843, 2005.
@article{sung_young_jung_statistical_2005,
  author   = {Jung, Sung Young and Hong, Jeong-Hee and Kim, Taek-Soo},
  title    = {A statistical model for user preference},
  journal  = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume   = {17},
  number   = {6},
  pages    = {834--843},
  year     = {2005},
  issn     = {1041-4347},
  doi      = {10.1109/TKDE.2005.86},
  url      = {http://ieeexplore.ieee.org/Xplore/login.jsp?url=/iel5/69/30743/01423983.pdf?tp=&isnumber=30743&arnumber=1423983&punumber=%3Cb%3E%3Cfont%20color=990000%3E69%3C/font%3E%3C/b%3E},
  keywords = {Approche statistique},
  abstract = {Modeling user preference is one of the challenging issues in intelligent information systems. Extensive research has been performed to automatically analyze user preference and to utilize it. One problem still remains: The representation of preference, usually given by measure of vector similarity or probability, does not always correspond to common sense of preference. This problem gets worse in the case of negative preference. To overcome this problem, this paper presents a preference model using mutual information in a statistical framework. This paper also presents a method that combines information of joint features and alleviates problems arising from sparse data. Experimental results, compared with the previous recommendation models, show that the proposed model has the highest accuracy in recommendation tests.},
  annote   = {{{\textless}p{\textgreater}jungSung2005.pdf{\textless}/p{\textgreater}}}
},
-
K. Spärck Jones, "A statistical interpretation of term specificity and its application in retrieval," Journal of documentation, vol. 60, iss. 5, pp. 493-502, 2004.
@article{sprck_jones_statistical_2004,
  author   = {{Spärck Jones}, Karen},
  title    = {A statistical interpretation of term specificity and its application in retrieval},
  journal  = {Journal of documentation},
  volume   = {60},
  number   = {5},
  pages    = {493--502},
  year     = {2004},
  issn     = {0022-0418},
  keywords = {Approche statistique, Recherche d'information},
  abstract = {The exhaustivity of document descriptions and the specificity of index terms are usually regarded as independent. It is suggested that specificity should be interpreted statistically, as a function of term use rather than of term meaning. The effects on retrieval of variations in term specificity are examined, experiments with three test collections showing, in particular, that frequently-occurring terms are required for good overall performance. It is argued that terms should be weighted according to collection frequency, so that matches on less frequent, more specific, terms are of greater value than matches on frequent terms. Results for the test collections show that considerable improvements in performance are obtained with this very simple procedure.},
  annote   = {{{\textless}p{\textgreater}sparckKaren2004\_1.pdf{\textless}/p{\textgreater}}}
},
-
P. M. Lee, Bayesian statistics : an introduction, 3rd ed., London; New York: Arnold ; Wiley, 2004.
@book{lee_bayesian_2004,
  author    = {Lee, Peter M.},
  title     = {Bayesian statistics : an introduction},
  edition   = {Third},
  publisher = {Arnold ; Wiley},
  address   = {London; New York},
  year      = {2004},
  isbn      = {0340814055},
  keywords  = {Approche statistique},
  annote    = {{{\textless}p{\textgreater}TOC} : 1. Preliminaries -- 2. Bayesian inference for the normal distribution -- 3. Some other common distributions -- 4. Hypothesis testing -- 5. Two-sample problems -- 6. Correlation, regression and the analysis of variance -- 7. Other topics -- 8. Hierarchical models -- 9. The Gibbs sampler and other numerical methods.{\textless}/p{\textgreater}}
},
-
F. Peng, Statistical natural language processing reading list, 2004.
@misc{peng_statistical_2004,
  author   = {Fuchun Peng},
  title    = {Statistical natural language processing reading list},
  year     = {2004},
  month    = apr,
  keywords = {Approche statistique, Langage naturel},
  annote   = {{{\textless}p{\textgreater}pengFuchun2004.pdf{\textless}/p{\textgreater}}}
},
-
"Predictive modeling : Naïve bayesian." [Data2Knowledge Corporation], 2003, pp. 8-13.
@incollection{_predictive_2003,
  title     = {Predictive modeling : Naïve bayesian},
  booktitle = {{D2K} getting started tutorial},
  publisher = {{[Data2Knowledge} Corporation]},
  year      = {2003},
  pages     = {8--13},
  url       = {http://algdocs.ncsa.uiuc.edu/TU-20030604-2.pdf},
  keywords  = {Approche statistique},
  annote    = {{\textless}p{\textgreater}d2k.pdf{\textless}/p{\textgreater}}
},
-
M. Rossignol and P. Sébillot, "Extraction statistique sur corpus de classes de mots-clés thématiques," Traitement automatique des langues, vol. 44, iss. 33, pp. 217-246, 2003.
-
W. H. Majoros, "Identification of key concepts in biomedical literature using a modified Markov heuristic," Bioinformatics, vol. 19, pp. 402-407, 2003.
@article{majoros_identification_2003,
  author   = {Majoros, W. H.},
  title    = {Identification of key concepts in biomedical literature using a modified {Markov} heuristic},
  journal  = {Bioinformatics},
  volume   = {19},
  number   = {3},
  pages    = {402--407},
  year     = {2003},
  url      = {http://www.ingentaconnect.com/content/oup/cabios/2003/00000019/00000003/art00402},
  keywords = {Approche statistique},
  abstract = {Motivation: The recent explosion of interest in mining the biomedical literature for associations between defined entities such as genes, diseases and drugs has made apparent the need for robust methods of identifying occurrences of these entities in biomedical text. Such concept-based indexing is strongly dependent on the availability of a comprehensive ontology or lexicon of biomedical terms. However, such ontologies are very difficult and expensive to construct, and often require extensive manual curation to render them suitable for use by automatic indexing programs. Furthermore, the use of statistically salient noun phrases as surrogates for curated terminology is not without difficulties, due to the lack of high-quality part-of-speech taggers specific to medical nomenclature. Results: We describe a method of improving the quality of automatically extracted noun phrases by employing prior knowledge during the {HMM} training procedure for the tagger. This enhancement, when combined with appropriate training data, can greatly improve the quality and relevance of the extracted phrases, thereby enabling greater accuracy in downstream literature mining tasks. Contact: bmajoros@tigr.org}
},
-
M. Huisman and M. A. J. van Duijn, "Software for statistical analysis of social networks," Connections, vol. 25, pp. 7-26, 2003.
@article{huisman_software_2003,
  author   = {Huisman, Mark and van Duijn, Marijtje A. J.},
  title    = {Software for statistical analysis of social networks},
  journal  = {Connections},
  volume   = {25},
  pages    = {7--26},
  year     = {2003},
  url      = {http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=9AE0B44C613542212FF9AA8E4F55314C?doi=10.1.1.105.9814&rep=rep1&type=pdf},
  keywords = {Approche statistique, Fouille de donnée},
  abstract = {This paper gives a state-of-the-art overview of available software for the statistical analysis of social networks as of Summer 2004. It reviews and compares software packages for social network analysis with respect to their statistical procedures, illustrating some procedures with example data. The choice of routines that were inspected is restricted to procedures for statistical modeling based on probability distributions (e.g., exponential random graph models, {QAP} correlation, statistical analysis of longitudinal network data). This definition of analysis routines excludes the extensive review of procedure-based routines based on more complex (iterative) algorithms like cluster analysis or eigendecompositions. The paper concludes with some recommendations. Key words: exponential random graph model, longitudinal network data, statistical modelling, software packages, permutation tests.},
  annote   = {{{\textless}p{\textgreater}huismanMark2003.pdf{\textless}/p{\textgreater}}}
},
-
J. Reffell, M. Aydelott, and J.-A. Fitzpatrick, Multidimensional data analysis, 2002.
@misc{reffell_multidimensional_2002,
  author   = {Reffell, James and Aydelott, Moryma and Fitzpatrick, Jean-Anne},
  title    = {Multidimensional data analysis},
  type     = {{PowerPoint}},
  year     = {2002},
  month    = feb,
  url      = {http://www2.sims.berkeley.edu/courses/is247/s02/lectures/MultidimensionalDataAnalysis.ppt},
  keywords = {Approche statistique},
  annote   = {{{\textless}p{\textgreater}reffellJames2002\_1.ppt{\textless}/p{\textgreater}}}
},
-
J.-M. Torres-Moreno, P. Velásquez-Morales, and J.-G. Meunier, "Condensés de textes par des méthodes numériques," in 6es journées internationales d’analyse statistique des données textuelles, 2002.
@inproceedings{torres-moreno_condenss_2002,
  author    = {Torres-Moreno, Juan-Manuel and Velásquez-Morales, Patricia and Meunier, Jean-Guy},
  title     = {Condensés de textes par des méthodes numériques},
  booktitle = {6es journées internationales d’analyse statistique des données textuelles},
  year      = {2002},
  keywords  = {Analyse de texte, Approche statistique, Catégorisation},
  abstract  = {Since information in electronic form is already a standard, and that the variety and the quantity of information become increasingly large, the methods of summarizing or automatic condensation of texts is a critical phase of the analysis of texts. This article describes Cortex a system based on numerical methods, which allows obtaining a condensation of a text, which is independent of the topic and of the length of the text. The structure of the system enables it to find the abstracts in French or Spanish in very short times. Étant donné que la variété et la quantité de l’information sous forme électronique deviennent de plus en plus grandes, des méthodes d’obtention de résumés ou de condensation automatique de textes constituent une phase critique de l’analyse de textes. Cet article décrit Cortex, un système basé sur des méthodes numériques qui permet l’obtention d’un condensé d’un texte, qui est indépendant du thème, de l’ampleur du texte et de la façon dont il est écrit. La structure du système lui permet de trouver la condensation de textes multilangues dans des temps très courts. Des applications en français ou espagnol sont présentées et analysées.}
},
-
G. Balmisse, Les réseaux bayésiens, km center, 2002.
@misc{balmisse_les_2002,
  author    = {Gilles Balmisse},
  title     = {Les réseaux bayésiens},
  publisher = {km center},
  year      = {2002},
  month     = sep,
  url       = {http://www.gillesbalmisse.com/IMG/pdf/GB_RB.pdf},
  keywords  = {Approche statistique},
  abstract  = {Les réseaux bayésiens ont pour objectif d’acquérir, représenter et utiliser la connaissance. Ils sont constitués de deux composantes : - un graphe causal, orienté, acyclique, dont les noeuds sont des variables d’intérêt du domaine, les arcs des relations de dépendance entre ces variables. L’ensemble des noeuds et des arcs forme ce que l’on appelle la structure du réseau bayésien. C’est la représentation qualitative de la connaissance. - un ensemble de distributions locales de probabilité qui sont les paramètres du réseau. Pour chaque noeud on dispose d’une table de probabilité P(variable/parents(variable)) qui représente la distribution locale de probabilité. Il faut remarquer que chaque noeud ne dépend que de l’état de ses parents. Il s’agit de la représentation quantitative de la connaissance},
  annote    = {{{\textless}p{\textgreater}balmisseGilles2002\_5.pdf{\textless}/p{\textgreater}}}
},
-
H. Larochelle, "Étude de la pertinence de métriques statistiques pour la détection de termes dans un document," Mémoire de stage du CRSNG, Université de Montréal, 2002.
@techreport{larochelle_tude_2002,
  author      = {Larochelle, Hugo},
  title       = {Étude de la pertinence de métriques statistiques pour la détection de termes dans un document},
  type        = {Mémoire de stage du {CRSNG}},
  institution = {Université de Montréal},
  year        = {2002},
  pages       = {47 p.},
  url         = {http://www.iro.umontreal.ca/~felipe/Memoires/hugo.pdf},
  keywords    = {Analyse de contenu, Approche statistique, Fouille de texte},
  abstract    = {L'extraction terminologique est une activité de spécialistes (ces spécialistes sont appelés des terminologues) et leur expertise est nécessaire dans de nombreux domaines, notamment la traduction. De nombreux outils d'aide à l'extraction terminologique ont été proposés ou sont disponibles sur le marché. La performance de ces logiciels n'est pas toujours faciles à estimer. Hugo a travaillé sur l'implantation et la comparaison des performances d'une large gamme de mesures statistiques proposées dans la littérature.},
  annote      = {{{\textless}p{\textgreater}larochelleHugo2002.pdf{\textless}/p{\textgreater}}}
},
-
M. Utiyama and H. Isahara, "A statistical model for domain-independent text segmentation," in Proceedings of the 39th annual meeting & 10th conference of the European chapter of the Association for Computational Linguistics (ACL), Toulouse, France, 2001, pp. 491-498.
@inproceedings{utiyama_statistical_2001,
  author    = {Utiyama, Masao and Isahara, Hitoshi},
  title     = {A statistical model for domain-independent text segmentation},
  booktitle = {Proceedings of the 39th annual meeting \& 10th conference of the {European} chapter of the {Association for Computational Linguistics} ({ACL})},
  publisher = {Morgan Kaufmann Publishers},
  address   = {Toulouse, France},
  year      = {2001},
  pages     = {491--498},
  isbn      = {1-55860-767-6},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.19.9271},
  keywords  = {Approche statistique, Segmentation},
  abstract  = {We propose a statistical method that finds the maximum-probability segmentation of a given text. This method does not require training data because it estimates probabilities from the given text. Therefore, it can be applied to any text in any domain. An experiment showed that the method is more accurate than or at least as accurate as a state-of-the-art text segmentation system.},
  annote    = {{{\textless}p{\textgreater}utiyamaMasao2001.pdf{\textless}/p{\textgreater}}}
},
-
P. Giudici, D. Heckerman, and J. Whittaker, "Statistical models for data mining," Data mining and knowledge discovery, vol. 5, iss. 3, pp. 163-165, 2001.
@article{giudici_statistical_2001,
  author   = {Giudici, Paolo and Heckerman, David and Whittaker, Joe},
  title    = {Statistical models for data mining},
  journal  = {Data mining and knowledge discovery},
  volume   = {5},
  number   = {3},
  pages    = {163--165},
  year     = {2001},
  month    = jul,
  issn     = {1384-5810},
  doi      = {10.1023/A:1011452614423},
  url      = {http://www.springerlink.com/content/x474882357636616/fulltext.pdf},
  keywords = {Approche statistique, Fouille de donnée},
  abstract = {We review the background to the papers presented in this special issue and give a short introduction to each. We also briefly describe the workshop on {“Statistical} models for data mining”, held in Pavia {(Italy),} in October 2000, where the papers were presented.},
  annote   = {{{\textless}p{\textgreater}giudiciPaolo2001.pdf{\textless}/p{\textgreater}}}
},
-
T. Hastie, R. Tibshirani, and J. H. Friedman, The elements of statistical learning : data mining, inference, and prediction, New York: Springer, 2001.
@book{hastie_elements_2001,
  author    = {Hastie, Trevor and Tibshirani, Robert and Friedman, J. H.},
  title     = {The elements of statistical learning : data mining, inference, and prediction},
  series    = {Springer series in statistics},
  publisher = {Springer},
  address   = {New York},
  year      = {2001},
  isbn      = {0387952845},
  keywords  = {Apprentissage machine, Approche statistique, Fouille de donnée}
},
-
C. D. Manning and H. Schütze, Foundations of statistical natural language processing, Cambridge, Mass.: MIT Press, 1999.
@book{manning_foundations_1999,
  author    = {Christopher D. Manning and Hinrich Schütze},
  title     = {Foundations of statistical natural language processing},
  publisher = {{MIT} Press},
  address   = {Cambridge, Mass.},
  year      = {1999},
  isbn      = {0262133601},
  keywords  = {Approche statistique, Linguistique},
  abstract  = {Statistical approaches to processing natural language text have become dominant in recent years. This foundational text is the first comprehensive introduction to statistical natural language processing {(NLP)} to appear. The book contains all the theory and algorithms needed for building {NLP} tools. It provides broad but rigorous coverage of mathematical and linguistic foundations, as well as detailed discussion of statistical methods, allowing students and researchers to construct their own implementations. The book covers collocation finding, word sense disambiguation, probabilistic parsing, information retrieval, and other applications.}
},
-
D. Beeferman, A. Berger, and J. Lafferty, "Statistical models for text segmentation," Machine Learning, vol. 34, iss. 1-3, pp. 177-210, 1999.
@article{beeferman_statistical_1999,
  author   = {Beeferman, Doug and Berger, Adam and Lafferty, John},
  title    = {Statistical models for text segmentation},
  journal  = {Machine Learning},
  series   = {Special issue on natural language learning},
  volume   = {34},
  number   = {1-3},
  pages    = {177--210},
  year     = {1999},
  month    = feb,
  issn     = {0885-6125},
  url      = {http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=5824D69E5F041AB1F0C987DF4C4C795B?doi=10.1.1.43.9557&rep=rep1&type=pdf},
  keywords = {Approche statistique, Fouille de texte, Segmentation},
  abstract = {This paper introduces a new statistical approach to automatically partitioning text into coherent segments. The approach is based on a technique that incrementally builds an exponential model to extract features that are correlated with the presence of boundaries in labeled training text. The models use two classes of features: topicality features that use adaptive language models in a novel way to detect broad changes of topic, and cue-word features that detect occurrences of specific words, which may be domain-specific, that tend to be used near segment boundaries. Assessment of our approach on quantitative and qualitative grounds demonstrates its effectiveness in two very different domains, Wall Street Journal news articles and television broadcast news story transcripts. Quantitative results on these domains are presented using a new probabilistically motivated error metric, which combines precision and recall in a natural and flexible way. This metric is used to make a quantitative assessment of the relative contributions of the different feature types, as well as a comparison with decision trees and previously proposed text segmentation algorithms.},
  annote   = {{{\textless}p{\textgreater}beefermanDoug1999.pdf{\textless}/p{\textgreater}}}
},
-
Y. Yang, "An evaluation of statistical approaches to text categorization," Information Retrieval, vol. 1, iss. 1-2, pp. 69-90, 1999.
@article{yang_evaluation_1999,
  author   = {Yang, Yiming},
  title    = {An evaluation of statistical approaches to text categorization},
  journal  = {Information Retrieval},
  volume   = {1},
  number   = {1-2},
  pages    = {69--90},
  year     = {1999},
  issn     = {1386-4564},
  url      = {http://www.springerlink.com/content/x3n6633584015p59/},
  keywords = {Approche statistique, Catégorisation},
  abstract = {This paper focuses on a comparative evaluation of a wide-range of text categorization methods, including previously published results on the Reuters corpus and new results of additional experiments. A controlled study using three classifiers, {kNN,} {LLSF} and {WORD,} was conducted to examine the impact of configuration variations in five versions of Reuters on the observed performance of classifiers. Analysis and empirical evidence suggest that the evaluation results on some versions of Reuters were significantly affected by the inclusion of a large portion of unlabelled documents, making those results difficult to interpret and leading to considerable confusions in the literature. Using the results evaluated on the other versions of Reuters which exclude the unlabelled documents, the performance of twelve methods are compared directly or indirectly. For indirect comparisons, {kNN,} {LLSF} and {WORD} were used as baselines, since they were evaluated on all versions of Reuters that exclude the unlabelled documents. As a global observation, {kNN,} {LLSF} and a neural network method had the best performance; except for a Naive Bayes approach, the other learning algorithms also performed relatively well.},
  annote   = {{{\textless}p{\textgreater}yangYiming1999\_1.pdf{\textless}/p{\textgreater}}}
},
-
Y. Yang, "An evaluation of statistical approaches to text categorization," , 1998.
@article{yang_evaluation_1998,
  author   = {Yiming Yang},
  title    = {An evaluation of statistical approaches to text categorization},
  year     = {1998},
  keywords = {Approche statistique, Catégorisation},
  annote   = {{{\textless}p{\textgreater}yangYiming1998.pdf{\textless}/p{\textgreater}}}
},
-
C. J. C. Burges, "A tutorial on support vector machines for pattern recognition," Data Mining and Knowledge Discovery, vol. 2, iss. 2, pp. 121-167, 1998.
@article{burges_tutorialsupport_1998-1,
  author   = {Burges, Christopher J. C.},
  title    = {A tutorial on support vector machines for pattern recognition},
  journal  = {Data Mining and Knowledge Discovery},
  volume   = {2},
  number   = {2},
  pages    = {121--167},
  year     = {1998},
  issn     = {1384-5810},
  doi      = {10.1023/A:1009715923555},
  url      = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.1083},
  keywords = {Approche statistique},
  abstract = {The tutorial starts with an overview of the concepts of {VC} dimension and structural risk minimization. We then describe linear Support Vector Machines {(SVMs)} for separable and non-separable data, working through a non-trivial example in detail. We describe a mechanical analogy, and discuss when {SVM} solutions are unique and when they are global. We describe how support vector training can be practically implemented, and discuss in detail the kernel mapping technique which is used to construct {SVM} solutions which are nonlinear in the data. We show how Support Vector machines can have very large (even infinite) {VC} dimension by computing the {VC} dimension for homogeneous polynomial and Gaussian radial basis function kernels. While very high {VC} dimension would normally bode ill for generalization performance, and while at present there exists no theory which shows that good generalization performance is guaranteed for {SVMs,} there are several arguments which support the observed high accuracy of {SVMs,} which we review. Results of some experiments which were inspired by these arguments are also presented. We give numerous examples and proofs of most of the key theorems. There is new material, and I hope that the reader will find that even old material is cast in a fresh light.},
  annote   = {{{\textless}p{\textgreater}burgesChristopher1998.pdf{\textless}/p{\textgreater}}}
},
-
C. J. C. Burges, "A tutorial on support vector machines for pattern recognition," Data Mining and Knowledge Discovery, vol. 2, iss. 2, pp. 121-167, 1998.
@article{burges_tutorialsupport_1998,
  author   = {Burges, Christopher J. C.},
  title    = {A tutorial on support vector machines for pattern recognition},
  journal  = {Data Mining and Knowledge Discovery},
  volume   = {2},
  number   = {2},
  pages    = {121--167},
  year     = {1998},
  issn     = {1384-5810},
  doi      = {10.1023/A:1009715923555},
  keywords = {Approche statistique},
  abstract = {The tutorial starts with an overview of the concepts of {VC} dimension and structural risk minimization. We then describe linear Support Vector Machines {(SVMs)} for separable and non-separable data, working through a non-trivial example in detail. We describe a mechanical analogy, and discuss when {SVM} solutions are unique and when they are global. We describe how support vector training can be practically implemented, and discuss in detail the kernel mapping technique which is used to construct {SVM} solutions which are nonlinear in the data. We show how Support Vector machines can have very large (even infinite) {VC} dimension by computing the {VC} dimension for homogeneous polynomial and Gaussian radial basis function kernels. While very high {VC} dimension would normally bode ill for generalization performance, and while at present there exists no theory which shows that good generalization performance is guaranteed for {SVMs,} there are several arguments which support the observed high accuracy of {SVMs,} which we review. Results of some experiments which were inspired by these arguments are also presented. We give numerous examples and proofs of most of the key theorems. There is new material, and I hope that the reader will find that even old material is cast in a fresh light.},
  annote   = {{{\textless}p{\textgreater}burgesChristopher1998\_1.pdf{\textless}/p{\textgreater}}}
},
-
C. A. Barry, Choosing qualitative data analysis software : Atlas/ti and Nudist compared, 1998.
@article{barry_choosing_1998,
  author   = {Barry, Christine A.},
  title    = {Choosing qualitative data analysis software : Atlas/ti and Nudist compared},
  journal  = {Sociological research online},
  volume   = {3},
  number   = {3},
  year     = {1998},
  url      = {http://www.socresonline.org.uk/3/3/4.html},
  keywords = {Approche statistique, Fouille de donnée},
  abstract = {Choosing between Nudist and Atlas/ti, the main qualitative data analysis software packages can be difficult. To assist researchers in making this choice, I have conceptualised their differences along two dimensions, related to the qualities of the software and of the research project. The software dimension is structural design, and the project dimension is complexity. Software structure is dichotomised between structured, sequential, verbal versus visual, spatial, interconnected modes of operation. Projects are dichotomised between homogeneous sample, short timeframe, single data-type, single data analyst; versus multiple samples, longitudinal data, multiple data types and team data analysis. First I review the {CAQDAS} literature. Then I outline the different personalities and strengths of Atlas/ti and Nudist, and show how they match these dimensions. I offer suggestions as to how to choose, and whether to use in tandem with complementary conceptual network software.}
}
-
S. Thiria, Y. Lechevalier, O. Gascuel, and S. Canu, Statistiques et méthodes neuronales, Paris: Dunod, 1997.
@book{thiria_statistiques_1997,
  author    = {Sylvie Thiria and Yves Lechevalier and Olivier Gascuel and Stéphane Canu},
  title     = {Statistiques et méthodes neuronales},
  publisher = {Dunod},
  address   = {Paris},
  year      = {1997},
  isbn      = {2100035444},
  keywords  = {Approche statistique, Méthodologie, Réseau de neurones}
}
-
N. J. Nilsson, "Statistical learning." , 1996, pp. 69-80.
@incollection{nilsson_statistical_1996,
  author    = {Nils J. Nilsson},
  title     = {Statistical learning},
  booktitle = {Introduction to machine learning},
  pages     = {69--80},
  year      = {1996},
  url       = {http://robotics.stanford.edu/~nilsson/MLDraftBook/ch5-ml.pdf},
  keywords  = {Approche statistique},
  annote    = {{{\textless}p{\textgreater}nilssonNils1996\_2.pdf{\textless}/p{\textgreater}}}
}
-
L. Lebart and A. Salem, Statistique textuelle, Paris: Dunod, 1994.
@book{lebart_statistique_1994,
  author    = {Ludovic Lebart and André Salem},
  title     = {Statistique textuelle},
  publisher = {Dunod},
  address   = {Paris},
  year      = {1994},
  isbn      = {2100022393},
  keywords  = {Analyse de texte, Approche statistique}
}
-
E. Charniak, Statistical language learning, Cambridge, Mass.: MIT Press, 1993.
@book{charniak_statistical_1993,
  author    = {Eugene Charniak},
  title     = {Statistical language learning},
  publisher = {{MIT} Press},
  address   = {Cambridge, Mass.},
  year      = {1993},
  isbn      = {0262032163},
  keywords  = {Approche statistique, Intelligence artificielle, Linguistique}
}
-
B. Frakes and R. Baeza-Yates, "Information retrieval : data structures \& algorithms," , Englewood Cliffs, N.J., 1992.
@book{frakes_information_1992,
  author    = {Frakes, Bill and Baeza-Yates, Ricardo},
  title     = {Information retrieval : data structures \& algorithms},
  publisher = {Prentice Hall},
  address   = {Englewood Cliffs, {N.J.}},
  year      = {1992},
  keywords  = {Approche statistique, Recherche d'information},
  abstract  = {Information retrieval is a sub-field of computer science that deals with the automated storage and retrieval of documents. Providing the latest information retrieval techniques, this guide discusses Information Retrieval data structures and algorithms, including implementations in C. Aimed at software engineers building systems with book processing components, it provides a descriptive and evaluative explanation of storage and retrieval systems, file structures, term and query operations, document operations and hardware. Contains techniques for handling inverted files, signature files, and file organizations for optical disks. Discusses such operations as lexical analysis and stoplists, stemming algorithms, thesaurus construction, and relevance feedback and other query modification techniques. Provides information on Boolean operations, hashing algorithms, ranking algorithms and clustering algorithms. In addition to being of interest to software engineering professionals, this book will be useful to information science and library science professionals who are interested in text retrieval technology.}
}
-
N. G. Fielding and R. M. Lee, Using computers in qualitative research, Thousand Oaks, Cal.: Sage, 1991.
@book{fielding_using_1991,
  author    = {Nigel G. Fielding and Raymond M. Lee},
  title     = {Using computers in qualitative research},
  publisher = {Sage},
  address   = {Thousand Oaks, Cal.},
  year      = {1991},
  keywords  = {Approche statistique, Méthodologie}
}
-
E. Brunet, Méthodes quantitatives et informatiques dans l’étude de textes, Paris: Champion, 1986.
@book{brunet_mthodes_1986,
  author    = {E. Brunet},
  title     = {Méthodes quantitatives et informatiques dans l'étude de textes},
  publisher = {Champion},
  address   = {Paris},
  year      = {1986},
  keywords  = {Approche statistique, Fouille de texte}
}
-
C. Muller, Initiation à la statistique linguistique, Paris: Larousse, 1968.
@book{muller_initiation_1968,
  author    = {Charles Muller},
  title     = {Initiation à la statistique linguistique},
  series    = {Langue et langage},
  publisher = {Larousse},
  address   = {Paris},
  year      = {1968},
  keywords  = {Approche statistique}
}
-
H. P. Luhn, "A statistical approach to mechanized encoding and searching of literary information," IBM Journal of Research and Development, vol. 1, iss. 4, pp. 309-317, 1957.
@article{luhn_statistical_1957,
  author   = {Luhn, H. P.},
  title    = {A statistical approach to mechanized encoding and searching of literary information},
  journal  = {{IBM} Journal of Research and Development},
  volume   = {1},
  number   = {4},
  pages    = {309--317},
  year     = {1957},
  keywords = {Approche statistique, Recherche d'information}
}
-
E. K. Jacob and A. Loehrlein, "Information Architecture." , 2009, vol. 43, pp. 147-186.
@incollection{jacob_information_2009,
  author    = {Jacob, Elin K. and Loehrlein, Aaron},
  title     = {Information Architecture},
  booktitle = {Annual Review of Information Science and Technology},
  volume    = {43},
  pages     = {147--186},
  year      = {2009},
  url       = {http://www.asis.org/Publications/ARIST/vol43.php},
  keywords  = {Architecture}
}
-
C. Wodtke and A. Govella, Information Architecture: Blueprints for the Web, 2 ed., New Riders Press, 2009.
@book{wodtke_information_2009,
  author     = {Christina Wodtke and Austin Govella},
  title      = {Information Architecture: Blueprints for the Web},
  shorttitle = {Information Architecture},
  edition    = {2},
  publisher  = {New Riders Press},
  month      = feb,
  year       = {2009},
  isbn       = {0321600800},
  keywords   = {Architecture}
}
-
P. Morville and L. Rosenfeld, Information architecture for the World Wide Web, Sebastopol, CA: O’Reilly, 2007.
@book{morville_information_2007,
  author    = {Peter Morville and Louis Rosenfeld},
  title     = {Information architecture for the World Wide Web},
  publisher = {{O'Reilly}},
  address   = {Sebastopol, {CA}},
  year      = {2007},
  isbn      = {9780596527341},
  keywords  = {Architecture, Design}
}
-
N. Davis, G. Demetriou, R. Gaizauskas, Y. K. Guo, and I. Roberts, "Web service architectures for text mining : an exploration of the issues via an e-science demonstrator," International Journal of Web Services Research, vol. 3, iss. 4, pp. 95-112, 2006.
@article{davis_web_2006,
  author     = {Davis, N. and Demetriou, G. and Gaizauskas, R. and Guo, Y. K. and Roberts, I.},
  title      = {Web service architectures for text mining : an exploration of the issues via an e-science demonstrator},
  shorttitle = {Web service architectures for text mining},
  journal    = {International Journal of Web Services Research},
  volume     = {3},
  number     = {4},
  pages      = {95--112},
  month      = dec,
  year       = {2006},
  issn       = {1545-7362},
  url        = {http://apps.isiknowledge.com/full_record.do?product=WOS&search_mode=CitationReport&qid=4&SID=1EOnkFI4kknf@49oNkC&page=1&doc=11},
  keywords   = {Architecture, Bio informatic, Fouille de texte, Web},
  abstract   = {Text mining technology can be used to assist in finding relevant or novel information in large volumes of unstructured data, such as that which is increasingly available in the electronic scientific literature. However publishers are not text mining specialists, nor typically are the end-user scientists who consume their products. This situation suggests a web services based solution, where text mining specialists process the literature obtained from publishers and make their results available to remote consumers (research scientists). In this paper we discuss the integration of web services and text mining within the domain of scientific publishing and explore the strengths and weaknesses of three generic architectural designs for delivering text mining web services. We argue for the superiority of one of these and demonstrate its viability by reference to an application designed to provide access to the results of text mining over the {PubMed} database of scientific abstracts.}
}
-
R. Mihalcea, "Random walks on text structures." Springer, 2006, pp. 249-262.
@incollection{mihalcea_random_2006,
  author    = {Mihalcea, Rada},
  title     = {Random walks on text structures},
  booktitle = {Computational linguistics and intelligent text processing},
  series    = {Lecture Notes in Computer Science},
  volume    = {3878},
  publisher = {Springer},
  pages     = {249--262},
  year      = {2006},
  isbn      = {978-3-540-32205-4},
  keywords  = {Analyse de texte, Architecture},
  abstract  = {Since the early ages of artificial intelligence, associative or semantic networks have been proposed as representations that enable the storage of language units and the relationships that interconnect them, allowing for a variety of inference and reasoning processes, and simulating some of the functionalities of the human mind. The symbolic structures that emerge from these representations correspond naturally to graphs – relational structures capable of encoding the meaning and structure of a cohesive text, following closely the associative or semantic memory representations. The activation or ranking of nodes in such graph structures mimics to some extent the functioning of human memory, and can be turned into a rich source of knowledge useful for several language processing applications. In this paper, we suggest a framework for the application of graph-based ranking algorithms to natural language processing, and illustrate the application of this framework to two traditionally difficult text processing tasks: word sense disambiguation and text summarization.},
  annote    = {{{\textless}p{\textgreater}mihalceaRada2006.pdf{\textless}/p{\textgreater}}}
}
-
J. Couto, "Une plate-forme informatique de navigation textuelle : modélisation, architecture, réalisation et application de NaviTexte," PhD Thesis , 2006.
@phdthesis{couto_une_2006,
  author   = {Couto, Javier},
  title    = {Une plate-forme informatique de navigation textuelle : modélisation, architecture, réalisation et application de {NaviTexte}},
  type     = {Thèse de doctorat},
  school   = {{Paris-Sorbonne}},
  year     = {2006},
  url      = {http://www.paris-sorbonne.fr/fr/article.php3?id_article=3926},
  keywords = {Architecture, Informatique},
  abstract = {Au long de l’histoire, des instruments de recherche d’information ou d’aide à la lecture, fondés sur la notion de page, tels que la table des matières, les index, les renvoi, etc., ont été introduits. Dès l’arrivée de l’informatique, ces instruments se sont multipliés et ils ont vu croître leur puissance. Dans le cadre général de l’histoire du texte numérique, l’hypertexte place un jalon du point de vue conceptuel, et l’utilisation massive {d’Internet} a répandu son utilisation à grande échelle. Le terme de navigation textuelle reçoit de multiples interprétations, la plus commune renvoyant au processus mis en œuvre par les outils de navigation utilisés pour circuler dans les documents hypertextes. Néanmoins, notre conception de la navigation textuelle se démarque de la navigation hypertextuelle traditionnelle car nous considérons que circuler ou naviguer dans un texte est l’expression d’un processus cognitif qui convoque des connaissances qui sont propres à la finalité de la navigation. Nous formulons l’hypothèse que la démarche du lecteur peut être assistée par l’exploitation de connaissances, présentes dans les textes, qui peuvent être, en partie, modélisées sous une forme déclarative. Autrement dit, il ne suffit pas de créer des liens mais il est nécessaire d’expliciter l’opération de navigation. De plus, ce processus de définition d’opérations de navigation doit être mis en œuvre par un « expert » capable d’encoder ces connaissances. Ce travail présente quatre contributions principales. En premier lieu, une représentation des textes spécifique à la navigation textuelle est définie. En deuxième lieu, un langage formel de modélisation des connaissances de visualisation et de navigation, nommé Sextant est proposé. Les constructions possibles du langage sont données par une syntaxe. Le sens des constructions syntaxiques du langage Sextant est déterminé par une sémantique opérationnelle. En troisième lieu, une plate-forme logicielle dédiée à la navigation textuelle, nommée {NaviTexte,} a été implémentée. En dernier lieu, diverses applications de la plate-forme logicielle {NaviTexte} à des cas réels d’utilisation ont été mises en œuvre.},
  annote   = {{{\textless}p{\textgreater}coutoJavier2006.pdf{\textless}/p{\textgreater}}}
}
-
D. Ferrucci and A. Lally, "Building an example application with the unstructured information management architecture," IBM Systems Journal. Unstructured Information Management, vol. 43, iss. 3, pp. 455-475, 2004.
@article{ferrucci_buildingexample_2004,
  author   = {D. Ferrucci and A. Lally},
  title    = {Building an example application with the unstructured information management architecture},
  journal  = {{IBM} Systems Journal. Unstructured Information Management},
  volume   = {43},
  number   = {3},
  pages    = {455--475},
  year     = {2004},
  keywords = {Architecture},
  abstract = {{IBM's} Unstructured Information Management Architecture {(UIMA)} is a software architecture for developing and deploying unstructured information management {(UIM)} applications. In this paper we provide a high-level overview of the architecture, introduce its basic components, and describe the set of tools that constitute a {UIMA} development framework. Then we take the reader through the steps involved in building a simple {UIM} application, thus highlighting the major {UIMA} concepts and techniques.},
  annote   = {{{\textless}p{\textgreater}ferrucciD2004.pdf{\textless}/p{\textgreater}}}
}
-
A. J. Warner, "Metadata and taxonomies for a more flexible information architecture," , 2002.
@inproceedings{warner_metadata_2002,
  author    = {Warner, Amy J.},
  title     = {Metadata and taxonomies for a more flexible information architecture},
  booktitle = {Third annual information architecture summit},
  year      = {2002},
  keywords  = {Architecture, Taxonomie},
  abstract  = {This presentation will describe a methodology for developing customized taxonomies and metadata schema for a collection and its users. The methodology takes into account both the basic indexable aspects of content objects and the ways that a particular group of users tends to search for them. The presentation will go on to illustrate how an information architecture based on taxonomies and metadata can be used to make a number of basic website and Intranet functions more flexible and dynamic. These include navigation with customized metadata-driven indexes; transparent or user-specified search combinations; personalized retrieval and filtering using various aspects of user profiles and content objects; and content management rules that can be customized for many different types of content objects.}
}
-
The first computers : history and architectures, Rojas, R. and Hashagen, U., Eds., MIT Press, 2000.
@book{rojas_first_2000,
  editor     = {Raúl Rojas and Ulf Hashagen},
  title      = {The first computers : history and architectures},
  shorttitle = {The first computers},
  publisher  = {{MIT} Press},
  year       = {2000},
  isbn       = {0-262-18197-5},
  keywords   = {Architecture, Informatique},
  annote     = {{{\textless}p{\textgreater}rojasRaul200.pdf{\textless}/p{\textgreater}}}
}
-
G. L. Heileman, M. Georgiopoulos, and C. Abdallah, "A dynamical adaptive resonance architecture," IEEE transactions on neural networks, vol. 5, iss. 6, pp. 873-889, 1994.
@article{heileman_dynamical_1994,
  author   = {Heileman, Gregory L. and Georgiopoulos, Michael and Abdallah, Chaouki},
  title    = {A dynamical adaptive resonance architecture},
  journal  = {{IEEE} transactions on neural networks},
  volume   = {5},
  number   = {6},
  pages    = {873--889},
  year     = {1994},
  issn     = {1045-9227},
  url      = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.50.8939},
  keywords = {Architecture},
  abstract = {A set of nonlinear differential equations that describe the dynamics of the {ART1} model are presented, along with the motivation for their use. These equations are extensions of those developed by Carpenter and Grossberg (1987). It is shown how these differential equations allow the {ART1} model to be realized as a collective nonlinear dynamical system. Specifically, we present an {ART1-based} neural network model whose description requires no external control features. That is, the dynamics of the model are completely determined by the set of coupled differential equations that comprise the model. It is shown analytically how the parameters of this model can be selected so as to guarantee a behavior equivalent to that of {ART1} in both fast and slow learning scenarios. Simulations are performed in which the trajectories of node and weight activities are determined using numerical approximation techniques},
  annote   = {{{\textless}p{\textgreater}heilemanGregory1994.pdf{\textless}/p{\textgreater}}}
}
-
L. V. Fausett, "What is a neural net?." Englewood Cliffs, NJ: Prentice-Hall, 1994, pp. 3-7.
@incollection{fausett_what_1994,
  author    = {Laurene V. Fausett},
  title     = {What is a neural net?},
  booktitle = {Fundamentals of neural networks : architectures, algorithms, and applications},
  publisher = {{Prentice-Hall}},
  address   = {Englewood Cliffs, {NJ}},
  pages     = {3--7},
  year      = {1994},
  isbn      = {0133341860},
  keywords  = {Architecture, Réseau de neurones},
  annote    = {{{\textless}p{\textgreater}Amélie} Gariépy (20081221): Création de la {notice{\textless}/p{\textgreater}{\textless}p{\textgreater}Source} de la notice: Notice créée à partir du document papier et {de{\textless}/p{\textgreater}{\textless}p{\textgreater}(http://www.amazon.fr/Statistiques-m\%C3\%A9thodes-neuronales-\%C3\%A9coles-dingenieurs/dp/2100035444){\textless}/p{\textgreater}{\textless}p{\textgreater}\ {\textless}/p{\textgreater}{\textless}p{\textgreater}Vérification} dans {BNC} {{\textless}/p{\textgreater}{\textless}p{\textgreater}\ {\textless}/p{\textgreater}{\textless}p{\textgreater}Pas} de {PDF{\textless}/p{\textgreater}}}
}
-
M. A. Hearst, "Context and structure in automated full-text information access," PhD Thesis , 1994.
@phdthesis{hearst_context_1994,
  author   = {Hearst, Marti A.},
  title    = {Context and structure in automated full-text information access},
  school   = {University of California, Berkeley},
  year     = {1994},
  url      = {http://citeseer.ist.psu.edu/rd/47088868%2C11907%2C1%2C0.25%2CDownload/http://citeseer.ist.psu.edu/cache/papers/cs/1941/ftp:zSzzSzparcftp.xerox.comzSzpubzSzhearstzSzphdthesis.pdf/hearst94context.pdf},
  keywords = {Architecture, Recherche d'information},
  abstract = {This dissertation investigates the role of contextual information in the automated retrieval and display of full-text documents, using robust natural language processing algorithms to automatically detect structure in and assign topic labels to texts. Many long texts are comprised of complex topic and subtopic structure, a fact ignored by existing information access methods. I present two algorithms which detect such structure, and two visual display paradigms which use the results of these algorithms to show the interactions of multiple main topics, multiple subtopics, and the relations between main topics and subtopics. The first algorithm, called \textit{TextTiling}, recognizes the subtopic structure of texts as dictated by their content. It uses domain-independent lexical frequency and distribution information to partition texts into multi-paragraph passages. The results are found to correspond well to reader judgments of major subtopic boundaries. The second algorithm assigns multiple main topic labels to each text, where the labels are chosen from pre-defined, intuitive category sets; the algorithm is trained on unlabeled text. A new iconic representation, called \textit{TileBars} uses {TextTiles} to simultaneously and compactly display query term frequency, query term distribution and relative document length. This representation provides an informative alternative to ranking long texts according to their overall similarity to a query. For example, a user can choose to view those documents that have an extended discussion of one set of terms and a brief but overlapping discussion of a second set of terms. This representation also allows for relevance feedback on patterns of term distribution. {TileBars} display documents only in terms of words supplied in the user query. For a given retrieved text, if the query words do not correspond to its main topics, the user cannot discern in what context the query terms were used. For example, a query on \textsl{contaminants} may retrieve documents whose main topics relate to nuclear power, food, or oil spills. To address this issue, I describe a graphical interface, called \textit{Cougar}, that displays retrieved documents in terms of interactions among their automatically-assigned main topics, thus allowing users to familiarize themselves with the topics and terminology of a text collection.},
  annote   = {{{\textless}p{\textgreater}hearstMarti1994.pdf{\textless}/p{\textgreater}}}
}
-
G. A. Carpenter and S. Grossberg, "ART 3 : hierarchical search using chemical transmitters in self-organizing pattern recognition architectures." Cambridge, Mass.: MIT Press, 1991, pp. 451-499.
@incollection{carpenter_art_1991,
  author    = {Gail A. Carpenter and Stephen Grossberg},
  title     = {{ART} 3 : hierarchical search using chemical transmitters in self-organizing pattern recognition architectures},
  booktitle = {Pattern recognition by self-organizing neural networks},
  publisher = {{MIT} Press},
  address   = {Cambridge, Mass.},
  pages     = {451--499},
  year      = {1991},
  isbn      = {0262031760},
  url       = {http://books.google.com/books?id=2u1fH0mxfz0C&dq=Pattern+recognition+by+self-organizing+neural+networks&printsec=frontcover&source=bn&hl=fr&sa=X&oi=book_result&resnum=4&ct=result#PPP13,M1},
  keywords  = {Architecture, Informatique}
}
-
J. Meunier, "La structure génétique des systèmes sémiotiques," Recherches sémiotiques, vol. 8, pp. 75-107, 1988.
@article{meunier_la_1988,
  author   = {Meunier, Jean-Guy},
  title    = {La structure génétique des systèmes sémiotiques},
  journal  = {Recherches sémiotiques},
  volume   = {8},
  pages    = {75--107},
  year     = {1988},
  keywords = {Architecture}
}
-
G. A. Carpenter and S. Grossberg, "A massively parallel architecture for a self-organizing neural pattern recognition machine," Computer Vision, Graphics, and Image Processing, vol. 37, iss. 1, pp. 54-115, 1987.
@article{carpenter_massively_1987,
  author   = {Gail A. Carpenter and Stephen Grossberg},
  title    = {A massively parallel architecture for a self-organizing neural pattern recognition machine},
  journal  = {Computer Vision, Graphics, and Image Processing},
  volume   = {37},
  number   = {1},
  pages    = {54--115},
  year     = {1987},
  keywords = {Architecture}
}
-
R. Sanderson and P. Watry, "Integrating data and text mining processes for digital library applications," , Vancouver, BC, Canada, 2007, pp. 73-79.
@inproceedings{sanderson_integrating_2007,
  author    = {Sanderson, Robert and Watry, Paul},
  title     = {Integrating data and text mining processes for digital library applications},
  booktitle = {Proceedings of the 7th {ACM/IEEE-CS} Joint Conference on Digital Libraries},
  publisher = {{ACM}},
  address   = {Vancouver, BC, Canada},
  pages     = {73--79},
  year      = {2007},
  isbn      = {978-1-59593-644-8},
  doi       = {10.1145/1255175.1255188},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1255188&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  keywords  = {Bibliothèque numérique, Fouille de donnée, Fouille de texte},
  abstract  = {This paper explores the integration of text mining and data mining techniques, digital library systems, and computational and data grid technologies with the objective of developing an online classification service exemplar. We discuss the current research issues relating to the use of data mining algorithms and toolkits for textual data; the necessary changes within the Cheshire3 Information Framework to accommodate analysis workflows; the outcomes of a demonstrator based on the National Library of Medicine's Medline dataset; and the provision of comparable metrics for evaluation purposes. The prototype has resulted in extremely accurate online classification services and offers a novel method of supporting text mining and data mining within a highly scaled computational environment, integrated seamlessly into the digital library architecture.},
  annote    = {{{\textless}p{\textgreater}sandersonRobert2007.pdf{\textless}/p{\textgreater}}}
}
-
H. Ding and I. Sølvberg, "Rule-based metadata interoperation in heterogeneous digital libraries," The Electronic Library, vol. 25, iss. 2, pp. 193-206, 2007.
@article{ding_rule-based_2007,
  author   = {Ding, Hao and Sølvberg, Ingeborg},
  title    = {Rule-based metadata interoperation in heterogeneous digital libraries},
  journal  = {The Electronic Library},
  volume   = {25},
  number   = {2},
  pages    = {193--206},
  year     = {2007},
  doi      = {10.1108/02640470710741322},
  url      = {http://thesius.emeraldinsight.com/10.1108/02640470710741322},
  keywords = {Bibliothèque numérique},
  abstract = {Purpose – The purpose of this research is to describe a system to support querying across distributed digital libraries created in heterogeneous metadata schemas, without requiring the availability of a global schema. Design/methodology/approach – The advantages and weaknesses of ontology based applications were investigated and have justified the utility of inferential rules in expressing complex relations between metadata terms in different metadata schemas. A process for combining ontologies and rules for specifying complex relations between metadata schemas were designed. The process was collapsed into a set of working phases and provides examples to illustrate how to interrelate two similar bibliographic ontology fragments for further query reformulation. Findings – Equipping ontologies with inferencing power can help describe more complex relations between metadata terms. This approach is critical for properly interpreting queries from one ontology to another. Research limitations/implications – A prototype system was built based on examples instead of practical experience. Practical implications – The approach assumes that relations between metadata sets, or ontologies in the approach, are provided by domain experts with/without ontology tools. Originality/value – A new approach has been proposed for facilitating heterogeneous metadata interoperation in digital libraries as a way of empowering ontologies with rich reasoning capabilities. The traditional approach assumes a global schema controlled by a central or virtual server to provide mapping between local and external metadata schemas. A more flexible and dynamic environment was studied, i.e. {P2P-based} digital libraries, where peers may join and leave freely.}
}
-
A. Zuccala, M. Thelwall, C. Oppenheim, and R. Dhiensa, "Web intelligence analyses of digital libraries : a case study of the National electronic Library for Health (NeLH)," Journal of Documentation, vol. 63, iss. 4, pp. 558-589, 2007.
@article{alesia_zuccala_web_2007,
  author   = {Zuccala, Alesia and Thelwall, Mike and Oppenheim, Charles and Dhiensa, Rajveen},
  title    = {Web intelligence analyses of digital libraries : a case study of the National electronic Library for Health {(NeLH)}},
  journal  = {Journal of Documentation},
  volume   = {63},
  number   = {4},
  pages    = {558--589},
  year     = {2007},
  issn     = {0022-0418},
  doi      = {10.1108/00220410710759011},
  url      = {http://www.emeraldinsight.com/10.1108/00220410710759011},
  keywords = {Bibliothèque numérique, Web},
  abstract = {Purpose – The purpose of this paper is to explore the use of {LexiURL} as a Web intelligence tool for collecting and analysing links to digital libraries, focusing specifically on the National electronic Library for Health {(NeLH).} Design/methodology/approach – The Web intelligence techniques in this study are a combination of link analysis (web structure mining), web server log file analysis (web usage mining), and text analysis (web content mining), utilizing the power of commercial search engines and drawing upon the information science fields of bibliometrics and webometrics. {LexiURL} is a computer program designed to calculate summary statistics for lists of links or {URLs.} Its output is a series of standard reports, for example listing and counting all of the different domain names in the data. Findings – Link data, when analysed together with user transaction log files (i.e. Web referring domains) can provide insights into who is using a digital library and when, and who could be using the digital library if they are “surfing” a particular part of the Web; in this case any site that is linked to or colinked with the {NeLH.} This study found that the {NeLH} was embedded in a multifaceted Web context, including many governmental, educational, commercial and organisational sites, with the most interesting being sites from the .edu domain, representing American Universities. Not many links directed to the {NeLH} were followed on September 25, 2005 (the date of the log file analysis and link extraction analysis), which means that users who access the digital library have been arriving at the site via only a few select links, bookmarks and search engine searches, or non-electronic sources. Originality/value – A number of studies concerning digital library users have been carried out using log file analysis as a research tool. Log files focus on real-time user transactions; while {LexiURL} can be used to extract links and colinks associated with a digital library's growing Web network. This Web network is not recognized often enough, and can be a useful indication of where potential users are surfing, even if they have not yet specifically visited the {NeLH} site.}
}
-
J. Gelernter, "Visual Classification with Information Visualization (Infoviz) for Digital Library Collections," Knowledge Organization, vol. 34, iss. 3, 2007.
@article{gelernter_visual_2007,
  author   = {Gelernter, Judith},
  title    = {Visual Classification with Information Visualization {(Infoviz)} for Digital Library Collections},
  journal  = {Knowledge Organization},
  volume   = {34},
  number   = {3},
  year     = {2007},
  url      = {http://www.cs.cmu.edu/afs/cs/Web/People/gelernter/classification.pdf},
  keywords = {Bibliothèque numérique, Visualisation de l'information}
}
-
C. Wei, P. J. Hu, C. Tai, C. Huang, and C. Yang, "Managing word mismatch problems in information retrieval : a topic-based query expansion approach," Journal of Management Information Systems, vol. 24, iss. 3, pp. 269-295, 2007.
@article{chih-ping_wei_managing_2007,
  author     = {Wei, Chih-Ping and Hu, Paul Jen-Hwa and Tai, Chia-Hung and Huang, Chun-Neng and Yang, Chin-Sheng},
  title      = {Managing word mismatch problems in information retrieval : a topic-based query expansion approach},
  shorttitle = {Managing Word Mismatch Problems in Information Retrieval},
  journal    = {Journal of Management Information Systems},
  volume     = {24},
  number     = {3},
  pages      = {269--295},
  year       = {2007},
  issn       = {0742-1222},
  url        = {http://search.ebscohost.com/login.aspx?direct=true&db=buh&AN=28857214&site=ehost-live},
  abstract   = {Word mismatch represents a fundamental information retrieval challenge that has become increasingly important as electronic document repositories (e.g., Web resources, digital libraries) grow in number and sheer volume. In general, word mismatch refers to the phenomenon in which a concept is described by different terms in user queries and in source documents. Query expansion represents a promising avenue to address such problems. Previous research predominantly approaches query expansion on the basis of global or local analysis. However, these approaches emphasize a global perspective rather than taking a topic-specific view of term associations. As a consequence, their effectiveness can be severely constrained when the document corpus spans a diverse set of topics. In this study, we propose a topic-based approach for query expansion and develop and empirically evaluate two novel methods--namely, nonfuzzy and fuzzy topic-based query expansion--to address word mismatch problems. According to our evaluation results, the proposed topic-based approach is more effective than a benchmark global analysis method, particularly when user queries consist of multiple query terms. {ABSTRACT} {FROM} {AUTHOR} Copyright of Journal of Management Information Systems is the property of {M.E.} Sharpe Inc. and its content may not be copied or emailed to multiple sites or posted to a listserv without the copyright holder's express written permission. However, users may print, download, or email articles for individual use. This abstract may be abridged. No warranty is given about the accuracy of the copy. Users should refer to the original published version of the material for the full abstract. {(Copyright} applies to all Abstracts)},
  keywords   = {Bibliothèque numérique, Cluster, Fouille de texte, Recherche d'information},
  annote     = {{{\textless}p{\textgreater}Accession} Number: 28857214; {CHIH-PING} {WEI} 1; {JEN-HWA} {HU,} {PAUL} 2; {CHIA-HUNG} {TAI} 3; {CHUN-NENG} {HUANG} 4; {CHIN-SHENG} {YANG} 5; Affiliations: 1: Professor, Institute of Technology Management, National Tsing Hua University, Taiwan; 2: Associate Professor and David Eccles Faculty Fellow, David Eccles School of Business, University of Utah; 3: Research Assistant, Institute of Information Science, Academia Sinica, Taiwan; 4: Project Manager Engineer, Asiatek Taiwan; 5: Second Lieutenant, Republic of China Army; Issue Info: Winter2007/2008, Vol. 24 Issue 3, p269; Thesaurus Term: {INFORMATION} technology; Thesaurus Term: {INFORMATION} retrieval; Thesaurus Term: {MANAGEMENT;} Subject Term: {DOCUMENT} clustering; Subject Term: {TEXT} mining {(Information} retrieval); Subject Term: {DIGITAL} libraries; {NAICS/Industry} Codes: 519120 Libraries and Archives; Number of Pages: 27p; Illustrations: 4 charts, 1 diagram, 3 graphs; Document Type: Article{\textless}/p{\textgreater}},
},
-
M. N. Uddin and P. Janecek, "Faceted classification in web information architecture : a framework for using semantic Web tools," The Electronic Library, vol. 25, iss. 2, pp. 219-233, 2007.
@article{uddin_faceted_2007,
  author   = {Uddin, Mohammad Nasir and Janecek, Paul},
  title    = {Faceted classification in web information architecture : a framework for using semantic Web tools},
  journal  = {The Electronic Library},
  volume   = {25},
  number   = {2},
  pages    = {219--233},
  year     = {2007},
  doi      = {10.1108/02640470710741340},
  url      = {http://thesius.emeraldinsight.com/10.1108/02640470710741340},
  abstract = {Purpose – The purpose of this paper is to develop and implement a faceted classification structure to improve web information organization, access and navigability. Design/methodology/approach – Some case studies of commercial web sites using faceted metadata were analyzed to develop the classification approach. The proposed framework adapts the facet analysis theory from Faceted Classification System {(FCS)} to use semantic web tools especially {XML} and {RDF} store, and ontology, and is designed to be integrated within a Content Management System {(CMS).} A detailed example of a faceted classification system for an academic information system is used to demonstrate the construction of an {FCS} from metadata. Findings – Detailed examples show how classifying and organizing information in multidimensional hierarchies is more accessible than simple one-dimensional taxonomic hierarchies. Research limitations/implications – A prototype, based on the proposed framework, is being developed using the web site of an academic institution as a case study. Originality/value – Enhances the {FCS} research with a notion of integrating ontology driven faceted classification structures by {XML/RDF} language and content management tools. A dynamic approach to organizing and searching web information provides users with multiple ways of accessing information based on their knowledge and information needs.},
  keywords = {Bibliothèque numérique, Classification},
},
-
R. Schult and M. Spiliopoulou, "Discovering emerging topics in unlabelled text collections," in Advances in databases and information systems : 10th East European conference, ADBIS 2006, Thessaloniki, Greece, september 3-7, 2006 : proceedings, 2006, pp. 353-66.
@inproceedings{schult_discovering_2006,
  author    = {Schult, R. and Spiliopoulou, M.},
  title     = {Discovering emerging topics in unlabelled text collections},
  booktitle = {Advances in databases and information systems : 10th East European conference, {ADBIS} 2006, Thessaloniki, Greece, september 3-7, 2006 : proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {4152},
  publisher = {{Springer-Verlag}},
  pages     = {353--366},
  year      = {2006},
  abstract  = {As document collections accumulate over time, some of the discussion subjects in them become outfashioned, while new ones emerge. Then, old classification schemes should be updated. In this paper, we address the challenge of finding emerging and persistent "themes", i.e. subjects that live long enough to be incorporated into a taxonomy or ontology describing the document collection. We focus on the identification of cluster labels that "survive" changes in the constitution of the underlying population of documents, including changes in the feature space of dominant words, because the terminology of the document archive also changes over time. We have conducted a set of promising experiments on the identification of themes that manifested themselves in section H2.8 of the {ACM} digital library and juxtapose them with the classes foreseen in the {ACM} taxonomy for this section},
  note      = {Copyright 2007, The Institution of Engineering and Technology},
  keywords  = {Bibliothèque numérique, Classification, Recherche d'information},
  annote    = {{\textless}p{\textgreater}9229827 unlabelled text collections document collections document archive {ACM} digital library classification{\textless}/p{\textgreater}},
},
-
C.-C. Chang and R.-S. Chen, "Using data mining technology to solve classification problems : a case study of campus digital library," The Electronic Library, vol. 24, iss. 3, pp. 307-321, 2006.
@article{chan-chine_chang_using_2006,
  author   = {Chang, Chan-Chine and Chen, Ruey-Shun},
  title    = {Using data mining technology to solve classification problems : a case study of campus digital library},
  journal  = {The Electronic Library},
  volume   = {24},
  number   = {3},
  pages    = {307--321},
  year     = {2006},
  issn     = {0264-0473},
  doi      = {10.1108/02640470610671178},
  url      = {http://www.emeraldinsight.com/10.1108/02640470610671178},
  abstract = {Purpose – Traditional library catalogs have become inefficient and inconvenient in assisting library users. Readers may spend a lot of time searching library materials via printed catalogs. Readers need an intelligent and innovative solution to overcome this problem. The paper seeks to examine data mining technology which is a good approach to fulfill readers' requirements. Design/methodology/approach – Data mining is considered to be the non-trivial extraction of implicit, previously unknown, and potentially useful information from data. This paper analyzes readers' borrowing records using the techniques of data analysis, building a data warehouse, and data mining. Findings – The paper finds that after mining data, readers can be classified into different groups according to the publications in which they are interested. Some people on the campus also have a greater preference for multimedia data. Originality/value – The data mining results shows that all readers can be categorized into five clusters, and each cluster has its own characteristics. The frequency with which graduates and associate researchers borrow multimedia data is much higher. This phenomenon shows that these readers have a higher preference for accepting digitized publications. Also, the number of readers borrowing multimedia data has increased over the years. This trend indicates that readers preferences are gradually shifting towards reading digital publications.},
  keywords = {Bibliothèque numérique, Classification, Fouille de donnée},
},
-
F. Papy, Les bibliothèques numériques, Paris: Hermès science publications : Lavoisier, 2005.
@book{papy_les_2005,
  author    = {Papy, Fabrice},
  title     = {Les bibliothèques numériques},
  publisher = {Hermès science publications : Lavoisier},
  address   = {Paris},
  year      = {2005},
  isbn      = {2746210363},
  keywords  = {Bibliothèque numérique},
  abstract  = {Les technologies du numérique ont investi l'espace séculaire de la bibliothèque en y introduisant d'une part de nouvelles ressources (numériques et numérisées), et en proposant d'autre part des modalités de recherche et de consultation faisant une large place aux Technologies de {l'Information} et de la Communication. Cette évolution technologique, en modifiant l'environnement documentaire, provoque une transformation des usages et des pratiques, et entraîne une modification du rôle des bibliothécaires et documentalistes. Cet ouvrage collectif grâce aux contributions de chercheurs et de professionnels de l'information et des bibliothèques, apporte des éléments de réponse pour mieux appréhender les mutations amorcées et celles qui se dessinent à court terme. Table des matières : L'évolution du rôle des bibliothécaires et documentalistes dans le cadre de la bibliothèque numérique {-Ch.} Lupovici. Le Tao de la bibliothèque numérique : bibliothèque sans bibliothécaire ? {-J.} Schopfel, J. Creusot. L'usager face à la bibliothèque numérique : l'expérience du portail d'information scientifique de {l'Institut} Pasteur {-E.} {Jannès-Ober.} L'esprit du numérique : bibliothèques numériques et démocratie {-O.} Fressard. L'accès aux catalogues des bibliothèques à l'âge des bibliothèques numériques et des moteurs de recherche : écarts, perturbations, mutation ? {-D.} Lahary. Pour une approche visuelle et ergonomique dans la recherche et l'exploration d'informations au sein d'un {OPAC} de {SCD.} L'exemple du {Visual…Catalog} {-F.} Papy, S. Chauvin. Interaction {3D} pour les bibliothèques numériques {-P.} Cubaud. Accès thématique en bibliothèque numérique : le rôle du langage documentaire de type thésaurus {-M.} Hudon. Les bibliothèques numériques : l'édition juridique online au service de l'infomédiation {-F.} Girard De Barros. Quelle offre éditoriale numérique pour l'usager virtuel des universités françaises ? {-G.} Chartron, M. Minon. 
{Revel@NiceProjet,} réalisation et perspectives d'un site institutionnel de périodiques électroniques {-M.} Roland. Bibliographies. Index.},
},
-
P. Haase, J. Völker, and Y. Sure, "Management of dynamic knowledge," Journal of Knowledge Management, vol. 2005, iss. 5, pp. 97-107, 2005.
@article{haase_management_2005,
  author   = {Haase, Peter and Völker, Johanna and Sure, York},
  title    = {Management of dynamic knowledge},
  journal  = {Journal of Knowledge Management},
  volume   = {9},
  number   = {5},
  pages    = {97--107},
  year     = {2005},
  doi      = {10.1108/13673270510622483},
  url      = {http://thesius.emeraldinsight.com/10.1108/13673270510622483},
  abstract = {Purpose – This paper presents a framework for ontology evolution tailored to Digital Libraries, which makes use of two different sources for change detection and propagation, the usage of ontologies by users and the changes of available data. Design/methodology/approach – After presenting the logical architecture of the evolution framework, we first illustrate how to deal with usage-driven changes, that is changes derived from the actual usage of ontologies. Second, we describe the generation of data-driven ontology changes based on the constant flow of documents coming into digital libraries. Findings – The proposed framework for ontology ontology evolution, which is currently applied and evaluated in the case studies, significantly reduces the costs of ontology updates and improves the quality of the ontology with respect to the users' requirements. Practical implications – The management of dynamic knowledge is crucial for many knowledge management applications. Our approach for usage-driven and data-driven change discovery not only assures the consistency of ontologies modeling dynamic knowledge, but also reduces the burden of manual ontology engineering. Originality/value – This paper presents the first approach towards a common framework for ontology evolution based on usage-driven and data-driven change discovery.},
  keywords = {Bibliothèque numérique, Gestion des connaissances, Recherche d'information},
},
-
C. Lagoze, D. B. Krafft, S. Payette, and S. Jesuroga, What Is a digital library anyway? Beyond search and access in the NSDL, 2005.
@article{lagoze_what_2005,
  author   = {Lagoze, Carl and Krafft, Dean B. and Payette, Sandy and Jesuroga, Susan},
  title    = {What Is a digital library anyway? Beyond search and access in the {NSDL}},
  journal  = {{D-Lib} Magazine},
  volume   = {11},
  number   = {11},
  month    = nov,
  year     = {2005},
  issn     = {1082-9873},
  doi      = {10.1045/november2005-lagoze},
  url      = {http://www.dlib.org/dlib/november05/lagoze/11lagoze.html},
  keywords = {Bibliothèque numérique, Recherche d'information},
  annote   = {{{\textless}p{\textgreater}Amélie} Gariépy (20090103) : Données vérifiées à partir du document {papier{\textless}/p{\textgreater}{\textless}p{\textgreater}Création} de la {notice{\textless}/p{\textgreater}{\textless}p{\textgreater}ISSN} 10829873{\textless}/p{\textgreater}},
},
-
N. Ferran, E. Mor, and J. Minguillón, "Towards personalization in digital libraries through ontologies," Library Management, vol. 26, iss. 4/5, pp. 206-217, 2005.
@article{ferran_towards_2005,
  author   = {Ferran, Núria and Mor, Enric and Minguillón, Julià},
  title    = {Towards personalization in digital libraries through ontologies},
  journal  = {Library Management},
  volume   = {26},
  number   = {4/5},
  pages    = {206--217},
  year     = {2005},
  doi      = {10.1108/01435120510596062},
  url      = {http://thesius.emeraldinsight.com/10.1108/01435120510596062},
  abstract = {Purpose – To describe a browsing and searching personalization system for digital libraries based on the use of ontologies for describing the relationships between all the elements which take part in a digital library scenario of use. Design/methodology/approach – Identification of all the desired functionalities and requirements that are necessary to fully integrate the use of a digital library in an e-learning environment, and the basic elements that are used to build the ontology that describes such scenario. Findings – The elements that determine the functionalities of the desired personalization system: first, the user's profile, including navigational history and user preferences; and second, the information collected from the navigational behavior of the digital library users. Research limitations/implications – The ontology is not complete. In fact, the ontology in itself will evolve with the new apparition of desired functionalities and requirements of the personalization system. Practical implications – Such a personalization system will be very helpful to the users of a digital library to improve their experience of use. Originality/value – The use of ontologies promotes the integration of new services into existing ones, and the interoperability with other systems through the appropriate semantic web services. New system functionalities and requirements can be added by including the appropriate description into the ontology framework that defines the digital library scenario of use.},
  keywords = {Bibliothèque numérique, Recherche d'information},
  annote   = {{{\textless}p{\textgreater}Personnalisation} Application ontologie Construction ontologie{\textless}/p{\textgreater}},
},
-
I. H. Witten, K. J. Don, M. Dewsnip, and V. Tablan, "Text mining in a digital library," International Journal on Digital Libraries, vol. 4, iss. 1, pp. 56-59, 2004.
@article{witten_text_2004,
  author   = {Witten, Ian H. and Don, Katherine J. and Dewsnip, Michael and Tablan, Valentin},
  title    = {Text mining in a digital library},
  journal  = {International Journal on Digital Libraries},
  volume   = {4},
  number   = {1},
  pages    = {56--59},
  year     = {2004},
  doi      = {10.1007/s00799-003-0066-4},
  url      = {http://www.springerlink.com/content/uuuv5md0gm8clrmw/fulltext.pdf},
  keywords = {Bibliothèque numérique, Fouille de texte},
  annote   = {{{\textless}p{\textgreater}wittenIan2004.pdf{\textless}/p{\textgreater}}},
},
-
E. McCulloch, "Multiple terminologies : an obstacle to information retrieval," Library Review, vol. 53, iss. 6, pp. 297-300, 2004.
@article{emma_multiple_2004,
  author   = {McCulloch, Emma},
  title    = {Multiple terminologies : an obstacle to information retrieval},
  journal  = {Library Review},
  volume   = {53},
  number   = {6},
  pages    = {297--300},
  year     = {2004},
  abstract = {An issue currently at the forefront of digital library research is the prevalence of disparate terminologies and the associated limitations imposed on user searching. It is thought that semantic interoperability is achievable by improving the compatibility between terminologies and classification schemes, enabling users to search multiple resources simultaneously and improve retrieval effectiveness through the use of associated terms drawn from several schemes. This column considers the terminology issue before outlining various proposed methods of tackling it, with a particular focus on terminology mapping.},
  keywords = {Bibliothèque numérique, Classification, Recherche d'information},
},
-
H. Avancini, A. Rauber, and F. Sebastiani, Organizing digital libraries by automated text categorization, 2002.
@misc{avancini_organizing_2002,
  author   = {Avancini, Henri and Rauber, Andreas and Sebastiani, Fabrizio},
  title    = {Organizing digital libraries by automated text categorization},
  month    = may,
  year     = {2002},
  url      = {http://dienst.isti.cnr.it/Dienst/Repository/2.0/Body/ercim.cnr.iei/2002-TR-05/pdf?tiposearch=cnr&langver=},
  abstract = {Text Categorization {(TC)} is the discipline concerned with the construction of automatic text classifiers, i.e. programs capable of assigning to a document one or more among a set of predefined categories based on the content of the document. Building these classifiers is itself done automatically, by means of a general inductive process that learns the characteristics of the categories from a set of preclassified documents. In this paper we discuss a class of applications, automatic indexing with controlled vocabularies, that is of direct concern to organizing digital libraries. We exemplify this class of applications by discussing an ongoing project aimed at classifying scientific papers about computer science with respect to the {ACM} Classification Scheme.},
  keywords = {Bibliothèque numérique, Catégorisation, Classification},
  annote   = {{{\textless}p{\textgreater}avanciniHenri2002.pdf{\textless}/p{\textgreater}}},
},
-
J. Hynek, "Document classification in a digital library," University of West Bohemia in Pilsen, Czech Republic, Technical report DCSE/TR-2002-04, 2002.
@techreport{hynek_document_2002,
  author      = {Hynek, Jiří},
  title       = {Document classification in a digital library},
  type        = {Technical report},
  number      = {{DCSE/TR-2002-04}},
  institution = {University of West Bohemia in Pilsen},
  address     = {Pilsen, Czech Republic},
  month       = feb,
  year        = {2002},
  note        = {42 p.},
  url         = {http://www.kiv.zcu.cz/vyzkum/publikace/technicke-zpravy/2002/tr-2002-04.pdf},
  keywords    = {Bibliothèque numérique, Classification},
  annote      = {{{\textless}p{\textgreater}hynekJiri2002.pdf{\textless}/p{\textgreater}}},
},
-
F. Lundh, Python standard library, O’Reilly Media, 2001.
@book{lundh_python_2001,
  author    = {Lundh, Fredrik},
  title     = {Python standard library},
  publisher = {{O'Reilly} Media},
  year      = {2001},
  url       = {http://proquest.safaribooksonline.com/0596000960},
  keywords  = {Bibliothèque numérique, Informatique},
  abstract  = {Python Standard Library is an essential guide for serious Python programmers. Python is a modular language that imports most useful operations from the standard library (basic support modules; operating system interfaces; network protocols; file formats; data conversions; threads and processes; and data storage). You can't really program in Python without using it. In this book,
author Fredrik Lundh, creator of the Python Imaging Library {(PIL),} delivers tested, accurate documentation of all the modules in the Python Standard Library, along with over 300 annotated example scripts using the modules. Python Standard Library renders this valuable information in a clean, easy-to-read format, yet doesn't talk down to readers. This accurate and complete reference documentation is for the Python programmer who wants the facts and little else. The book is based on the author's work with the Python newsgroup: he reviewed more than 2500 questions and answers to that newsgroup in order to make sure the book covered what Python users really wanted to know. An earlier version of this book has been available electronically for over a year, so the material has been tested by Python programmers in real-life applications. This version of Python Standard Library covers all the new modules and related information for Python 2.0, the first new major release of Python in four years.},
  annote    = {{{\textless}p{\textgreater}Accessible} en ligne via Safari Books Online (http://proquest.safaribooksonline.com/0596000960){\textless}/p{\textgreater}},
},
-
F. Sebastiani, "Organizing and using digital libraries by automated text categorization," in Proceedings of the AI*IA workshop on artificial intelligence for cultural heritage and digital libraries, 2001, p. 93.
@inproceedings{sebastiani_organizing_2001,
  author    = {Sebastiani, Fabrizio},
  title     = {Organizing and using digital libraries by automated text categorization},
  booktitle = {Proceedings of the {AI*IA} workshop on artificial intelligence for cultural heritage and digital libraries},
  editor    = {Bordoni, Luciana and Semeraro, Giovanni},
  pages     = {93--94},
  year      = {2001},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.21.3668&rep=rep1&type=pdf},
  abstract  = {When it was proclaimed that the Library contained all books, the first impression was one of extravagant happiness. All men felt themselves to be the masters of an intact and secret treasure. There was no personal or world problem whose eloquent solution did not exist in some hexagon. (...) As was natural, this inordinate hope was followed by an excessive depression. The certitude that some shelf in some hexagon held precious books and that these precious books were inaccessible, seemed almost intolerable. {[Jorge} Luis Borges, The Library of Babel, 1941]},
  keywords  = {Bibliothèque numérique, Catégorisation},
  annote    = {{{\textless}p{\textgreater}sebastianiFabrizio2001.pdf{\textless}/p{\textgreater}}},
},