-
L. Wanner, R. Baeza-Yates, S. Brugmann, J. Codina, B. Diallo, E. Escorsa, M. Giereth, Y. Kompatsiaris, S. Papadopoulos, E. Pianta, G. Piella, I. Puhlmann, G. Rao, M. Rotard, P. Schoester, L. Serafini, and V. Zervaki, "Towards content-oriented patent document processing," World Patent Information, vol. 30, iss. 1, pp. 21-33, 2008.
@article{wanner_towards_2008,
  author   = {Leo Wanner and Ricardo {Baeza-Yates} and Soren Brugmann and Joan Codina and Barrou Diallo and Enric Escorsa and Mark Giereth and Yiannis Kompatsiaris and Symeon Papadopoulos and Emanuele Pianta and Gemma Piella and Ingo Puhlmann and Gautam Rao and Martin Rotard and Pia Schoester and Luciano Serafini and Vasiliki Zervaki},
  title    = {Towards content-oriented patent document processing},
  journal  = {World Patent Information},
  volume   = {30},
  number   = {1},
  pages    = {21--33},
  year     = {2008},
  url      = {http://www.sciencedirect.com/science/article/B6V5D-4NT93N4-1/2/28ca16a1b7ef2db1ffd30377e0b37df4},
  keywords = {Classification, Extraction d'information, Ontologie, Visualisation de l'information},
  abstract = {In this article, we present ongoing work on an advanced patent processing service {PATExpert.} The central assumption underlying {PATExpert} is that in order to meet the needs of the users of patent processing services, recourse must be made to the content of patent material. We introduce a content representation schema for patent documentation and sketch the design of techniques that facilitate the integration of this schema into the patent processing cycle. Two types of techniques are discussed. Techniques of the first type facilitate the access to the content of patent documentation provided in a textual format - be it by the human reader or by the machine - in that they rephrase and summarize the documentation and map it onto a formal semantic representation. Techniques of the second type operate on the content representation. At this stage, {PATExpert} is explored in two technology areas - optical recording devices and machine tools. The work is being carried out in the framework of an {R\&D-project} partially funded by the European Commission.},
}
-
D. Delen and M. D. Crossland, "Seeding the survey and analysis of research literature with text mining," Expert Systems with Applications, vol. 34, iss. 3, pp. 1707-1720, 2008.
@article{delen_seedingsurvey_2008,
  author   = {Delen, D. and Crossland, M. D.},
  title    = {Seeding the survey and analysis of research literature with text mining},
  journal  = {Expert Systems with Applications},
  volume   = {34},
  number   = {3},
  pages    = {1707--1720},
  month    = apr,
  year     = {2008},
  issn     = {0957-4174},
  url      = {http://apps.isiknowledge.com/full_record.do?product=WOS&search_mode=CitationReport&qid=4&SID=1EOnkFI4kknf@49oNkC&page=1&doc=24},
  keywords = {Catégorisation, Classification, Cluster, Extraction d'information, Fouille de texte},
  abstract = {Text mining is a semi-automated process of extracting knowledge from a large amount of unstructured data. Given that the amount of unstructured data being generated and stored is increasing rapidly, the need for automated means to process it is also increasing. In this study, we present, discuss and evaluate the techniques used to perform text mining on collections of textual information. A case study is presented using text mining to identify clusters and trends of related research topics from three major journals in the management information systems field. Based on the findings of this case study, it is proposed that this type of analysis could potentially be valuable for researchers in any field. {(C)} 2007 Elsevier Ltd. All rights reserved.},
}
-
Z. Zhang, "Mining relational data from text : from strictly supervised to weakly supervised learning," Information systems, vol. 33, iss. 3, pp. 300-314, 2008.
@article{zhang_mining_2008,
  author     = {Zhang, Z.},
  title      = {Mining relational data from text : from strictly supervised to weakly supervised learning},
  shorttitle = {Mining relational data from text},
  journal    = {Information Systems},
  volume     = {33},
  number     = {3},
  pages      = {300--314},
  month      = may,
  year       = {2008},
  issn       = {0306-4379},
  url        = {http://apps.isiknowledge.com/full_record.do?product=WOS&search_mode=CitationReport&qid=4&SID=1EOnkFI4kknf@49oNkC&page=1&doc=23},
  keywords   = {Extraction d'information},
  abstract   = {This paper approaches the relation classification problem in information extraction framework with different machine learning strategies, from strictly supervised to weakly supervised. A number of learning algorithms are presented and empirically evaluated on a standard data set. We show that a supervised {SVM} classifier using various lexical and syntactic features can achieve competitive classification accuracy. Furthermore, a variety of weakly supervised learning algorithms can be applied to take advantage of large amount of unlabeled data when labeling is expensive. Newly introduced random-subspace-based algorithms demonstrate their empirical advantage over competitors in the context of both active learning and bootstrapping. {(C)} 2007 Elsevier {B.V.} All rights reserved.},
}
-
S. Wolthusen, "Automated extraction of behavioural profiles from document usage," BT Technology Journal, vol. 25, iss. 1, pp. 192-200, 2007.
@article{wolthusen_automated_2007,
  author   = {S. Wolthusen},
  title    = {Automated extraction of behavioural profiles from document usage},
  journal  = {{BT} Technology Journal},
  volume   = {25},
  number   = {1},
  pages    = {192--200},
  year     = {2007},
  url      = {http://www.ingentaconnect.com/content/klu/bttj/2007/00000025/00000001/00000020},
  keywords = {Extraction d'information},
  abstract = {Both human analysts and particularly automated tool suites are capable of deriving sensitive information and conclusions from collections of data items that individually cannot be considered critical or sensitive. This activity of analysing and correlating material that is not immediately related is, in fact, highly desirable in many application areas and cannot be controlled precisely in advance. The decision whether a program or an analyst is performing searches and correlations beyond the scope of his authorisation or current mission can frequently be determined only ex post based on a heuristic analysis of documents accessed. In this paper we describe a mechanism for the instrumentation of operating systems to obtain information on the documents and resources accessed by arbitrary processes. Such a mechanism could be an important component of the infrastructure of an operational risk management system, generating an audit trail for compliance and forensic investigation, and acting as a sensor generating data for analysis. Addressing the latter application, the paper also outlines an approach for extracting textual information and metadata from accessed documents, regardless of the application program and workflow mechanisms used, without unduly impeding either workflows or operator performance. This information can then be subjected to an heuristic analysis based on natural language processing to extract the semantic context of each document or segment. Clustering this content and extracting the conceptual patterns that a user has accessed can then allow abnormal behaviour to be identified. This can then be refined further to determine heuristically whether the authorised remit of the user has been breached and whether an investigation is warranted. We argue that the risk of misbehaviour can be reduced while at the same time increasing productivity.
This is made possible by enhancing the degree of freedom for individual users to act in the interest of their mission objectives and at the same time providing automated mechanisms for analysing user behaviour.},
}
-
B. Tso and P. Y. Chang, "Mining free-structured information based on hidden Markov models," Expert Systems with Applications, vol. 32, iss. 1, pp. 97-102, 2007.
@article{tso_mining_2007,
  author   = {Brandt Tso and Paul Y. Chang},
  title    = {Mining free-structured information based on hidden Markov models},
  journal  = {Expert Systems with Applications},
  volume   = {32},
  number   = {1},
  pages    = {97--102},
  year     = {2007},
  url      = {http://www.sciencedirect.com/science/article/B6V03-4HYMSJX-3/2/c1582b7f072c58e30d8c108aaca55552},
  keywords = {Extraction d'information},
  abstract = {The potentials of hidden Markov models {(HMM)} in mining free-structured information are investigated in this study. The samples under test are relating to {C4ISR} information derived from the contents of {`Forecast} International', which is a web-based database containing free-structured archive of forecast reports about aerospace systems, weapon systems, and military industries. This study focuses on three {C4ISR} relating target terms, namely, {`Company',} {`System} types', and `cost', for information mining analysis. The experiments are performed in two stages. In the first stage, each {HMM} being built is exclusively serving for one target term information extraction so as to test the {HMM} fundamental information extraction capability. While in the second stage, the experiment is then extended to resolve a more complex, multiple term extraction issue. The results reveal that, by using {HMMs} as a basis, the accuracies can all achieve more than 80\% for single target term extraction, and 76\% in average for multi-term extraction case.},
}
-
P. Zweigenbaum, D. Demner-Fushman, H. Yu, and K. B. Cohen, "Frontiers of biomedical text mining : current progress," Briefings in Bioinformatics, vol. 8, iss. 5, pp. 358-375, 2007.
@article{zweigenbaum_frontiers_2007,
  author     = {Pierre Zweigenbaum and Dina {Demner-Fushman} and Hong Yu and Kevin B. Cohen},
  title      = {Frontiers of biomedical text mining : current progress},
  shorttitle = {Frontiers of biomedical text mining},
  journal    = {Briefings in Bioinformatics},
  volume     = {8},
  number     = {5},
  pages      = {358--375},
  year       = {2007},
  issn       = {1467-5463},
  keywords   = {Extraction d'information, Fouille de texte, Recherche d'information},
  abstract   = {It is now almost 15 years since the publication of the first paper on text mining in the genomics domain, and decades since the first paper on text mining in the medical domain. Enormous progress has been made in the areas of information retrieval, evaluation methodologies and resource construction. Some problems, such as abbreviation- handling, can essentially be considered solved problems, and others, such as identification of gene mentions in text, seem likely to be solved soon. However, a number of problems at the frontiers of biomedical text mining continue to present interesting challenges and opportunities for great improvements and interesting research. In this article we review the current state of the art in biomedical text mining or {‘BioNLP’} in general, focusing primarily on papers published within the past year. {ABSTRACT} {FROM} {AUTHOR} Copyright of Briefings in Bioinformatics is the property of Oxford University Press / {UK} and its content may not be copied or emailed to multiple sites or posted to a listserv without the copyright holder's express written permission. However, users may print, download, or email articles for individual use. This abstract may be abridged. No warranty is given about the accuracy of the copy. Users should refer to the original published version of the material for the full abstract. {(Copyright} applies to all Abstracts)},
  annote     = {{{\textless}p{\textgreater}Accession} Number: 27743245; Zweigenbaum, Pierre; Email Address: pz@limsi.ft; {Demner-Fushman,} Dina; Hong Yu; Cohen, Kevin B.; Issue Info: Sep2007, Vol. 8 Issue 5, p358; Thesaurus Term: {DATA} mining; Thesaurus Term: {INFORMATION} retrieval; Thesaurus Term: {INFORMATION} resources; Thesaurus Term: {SEARCH} engines; Thesaurus Term: {INFORMATION} science; Subject Term: {TEXT} mining {(Information} retrieval); Subject Term: {MEDICAL} sciences; Subject Term: {MOLECULAR} genetics; {Author-Supplied} Keyword: evaluation; {Author-Supplied} Keyword: image mining; {Author-Supplied} Keyword: in formation extraction; {Author-Supplied} Keyword: literature-based discovery; {Author-Supplied} Keyword: natural language processing; {Author-Supplied} Keyword: question answering; {Author-Supplied} Keyword: text mining; {Author-Supplied} Keyword: text summarization; {Author-Supplied} Keyword: user orientation; Number of Pages: 18p; Document Type: Article{\textless}/p{\textgreater}},
}
-
V. de Boer, M. van Someren, and B. J. Wielinga, "A redundancy-based method for the extraction of relation instances from the Web," International Journal of Human-Computer Studies, vol. 65, iss. 9, pp. 816-831, 2007.
@article{de_boer_redundancy-based_2007,
  author   = {de Boer, Viktor and van Someren, Maarten and Wielinga, Bob J.},
  title    = {A redundancy-based method for the extraction of relation instances from the Web},
  journal  = {International Journal of {Human-Computer} Studies},
  volume   = {65},
  number   = {9},
  pages    = {816--831},
  year     = {2007},
  url      = {http://www.sciencedirect.com/science/article/B6WGR-4NWKCRF-1/2/61858c2a4bdc6fc9f2fd956519c53089},
  keywords = {Extraction d'information, Web sémantique},
  abstract = {The Semantic Web requires automatic ontology population methods. We developed an approach, that given existing ontologies, extracts instances of ontology relations, a specific subtask of ontology population. We use generic, domain-independent techniques to extract candidate relation instances from the Web and exploit the redundancy of information on the Web to compensate for loss of precision caused by the use of these generic methods. The candidate relation instances are then ranked based on co-occurrence with a small seed set. In an experiment, we extracted instances of the relation between artists and art styles. The results were manually evaluated against selected art resources. The method was also tested in the football domain. We also compare the performance of our ranking to that of a Google-hit count-based method.},
}
-
M. J. Weal, H. Alani, S. Kim, P. H. Lewis, D. E. Millard, P. A. S. Sinclair, D. C. De Roure, and N. R. Shadbolt, "Ontologies as facilitators for repurposing web documents," International Journal of Human-Computer Studies, vol. 65, iss. 6, pp. 537-562, 2007.
@article{weal_ontologies_2007,
  author   = {Weal, Mark J. and Alani, Harith and Kim, Sanghee and Lewis, Paul H. and Millard, David E. and Sinclair, Patrick A. S. and De Roure, David C. and Shadbolt, Nigel R.},
  title    = {Ontologies as facilitators for repurposing web documents},
  journal  = {International Journal of {Human-Computer} Studies},
  volume   = {65},
  number   = {6},
  pages    = {537--562},
  year     = {2007},
  url      = {http://www.sciencedirect.com/science/article/B6WGR-4N56BWX-1/2/e4a530b7c96d9b83a8aec930f803e8e9},
  keywords = {Document numérique, Extraction d'information, Web},
  abstract = {This paper investigates the role of ontologies as a central part of an architecture to repurpose existing material from the web. A prototype system called {ArtEquAKT} is presented, which combines information extraction, knowledge management and consolidation techniques and adaptive document generation. All of these components are co-ordinated using one central ontology, providing a common vocabulary for describing the information fragments as they are processed. Each of the components of the architecture is described in detail and an evaluation of the system discussed. Conclusions are drawn as to the effectiveness of such an approach and further challenges are outlined.},
}
-
M. Ruiz-Casado, E. Alfonseca, and P. Castells, "Automatising the learning of lexical patterns : an application to the enrichment of WordNet by extracting semantic relationships from Wikipedia," Data \& Knowledge Engineering, vol. 61, iss. 3, pp. 484-499, 2007.
@article{ruiz-casado_automatisinglearning_2007,
  author   = {Maria {Ruiz-Casado} and Enrique Alfonseca and Pablo Castells},
  title    = {Automatising the learning of lexical patterns : an application to the enrichment of {WordNet} by extracting semantic relationships from Wikipedia},
  journal  = {Data \& Knowledge Engineering},
  volume   = {61},
  number   = {3},
  pages    = {484--499},
  year     = {2007},
  url      = {http://www.sciencedirect.com/science/article/B6TYX-4KF1FYV-5/2/5d9f1244b049a53f3ccb6b0a762e355e},
  keywords = {Extraction d'information, Ontologie},
  abstract = {This paper describes an automatic approach to identify lexical patterns that represent semantic relationships between concepts in an on-line encyclopedia. Next, these patterns can be applied to extend existing ontologies or semantic networks with new relations. The experiments have been performed with the Simple English Wikipedia and {WordNet} 1.7. A new algorithm has been devised for automatically generalising the lexical patterns found in the encyclopedia entries. We have found general patterns for the hyperonymy, hyponymy, holonymy and meronymy relations and, using them, we have extracted more than 2600 new relationships that did not appear in {WordNet} originally. The precision of these relationships depends on the degree of generality chosen for the patterns and the type of relation, being around 60-70\% for the best combinations proposed.},
}
-
Y. Li and K. Bontcheva, "Hierarchical, perceptron-like learning for ontology-based information extraction," , Banff, Alberta, Canada, 2007, pp. 777-786.
@inproceedings{li_hierarchical_2007,
  author    = {Yaoyong Li and Kalina Bontcheva},
  title     = {Hierarchical, perceptron-like learning for ontology-based information extraction},
  booktitle = {Proceedings of the 16th International Conference on {World Wide Web}},
  pages     = {777--786},
  year      = {2007},
  publisher = {{ACM}},
  address   = {Banff, Alberta, Canada},
  isbn      = {978-1-59593-654-7},
  doi       = {10.1145/1242572.1242677},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1242677&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  keywords  = {Extraction d'information, Ontologie},
  abstract  = {Recent work on ontology-based Information Extraction {(IE)} has tried to make use of knowledge from the target ontology in order to improve semantic annotation results. However, very few approaches exploit the ontology structure itself, and those that do so, have some limitations. This paper introduces a hierarchical learning approach for {IE,} which uses the target ontology as an essential part of the extraction process, by taking into account the relations between concepts. The approach is evaluated on the largest available semantically annotated corpus. The results demonstrate clearly the benefits of using knowledge from the ontology as input to the information extraction process. We also demonstrate the advantages of our approach over other state-of-the-art learning systems on a commonly used benchmark dataset.},
  annote    = {{{\textless}p{\textgreater}liYaoyong2007.pdf{\textless}/p{\textgreater}}},
}
-
M. Labsky, M. Nekvasil, and V. Svatek, "Towards web information extraction using extraction ontologies and (indirectly) domain ontologies," , Whistler, BC, Canada, 2007, pp. 201-202.
@inproceedings{labsky_towards_2007,
  author    = {Martin Labsky and Marek Nekvasil and Vojtech Svatek},
  title     = {Towards web information extraction using extraction ontologies and (indirectly) domain ontologies},
  booktitle = {Proceedings of the 4th International Conference on Knowledge Capture},
  pages     = {201--202},
  year      = {2007},
  publisher = {{ACM}},
  address   = {Whistler, {BC,} Canada},
  isbn      = {978-1-59593-643-1},
  doi       = {10.1145/1298406.1298454},
  url       = {http://portal.acm.org/citation.cfm?id=1298406.1298454&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  keywords  = {Extraction d'information, Ontologie, Web},
  abstract  = {Extraction ontologies allow to swiftly proceed from initial domain modelling to running a functional prototype of a web information extraction application. We investigate the possibility of semi-automatically deriving extraction ontologies from third-party domain ontologies.},
  annote    = {{{\textless}p{\textgreater}labskyMartin2007.pdf{\textless}/p{\textgreater}}},
}
-
H. Mangassarian and H. Artail, "A general framework for subjective information extraction from unstructured English text," Data \& Knowledge Engineering, vol. 62, iss. 2, pp. 352-367, 2007.
@article{mangassarian_general_2007,
  author   = {Hratch Mangassarian and Hassan Artail},
  title    = {A general framework for subjective information extraction from unstructured English text},
  journal  = {Data \& Knowledge Engineering},
  volume   = {62},
  number   = {2},
  pages    = {352--367},
  year     = {2007},
  url      = {http://www.sciencedirect.com/science/article/B6TYX-4M9455T-1/2/b19a327662e8c1d318a0c3e3465a0189},
  keywords = {Extraction d'information},
  abstract = {In this paper, we present an information extraction {(IE)} strategy for handling subjective information from unstructured text. The presented methodology is general: it can be useful in many real-life applications that could potentially benefit from an automatic {IE} system that makes human-like decisions. We test our methodology in the sphere of company news evaluation with respect to the potential effect of the news on the company's stock prices. The described general framework comprises four sequential processing steps: part-of-speech tagging, syntactic parsing, relation generation, and criteria evaluation. The first two steps perform generic {NLP} tasks, while the last two phases are application-specific and require a thorough understanding of the application domain. We describe each stage and illustrate the flow of the modus operandi. We keep up with the company news evaluation example throughout the paper. Due to the inherent subjectivity of the envisaged problem, results cannot be categorically justified. However, comparing the system's evaluation of company news to our own, the results were very encouraging.},
}
-
Y. Yildirim, T. Yilmaz, and A. Yazici, "Ontology-supported object and event extraction with a genetic algorithms approach for object classification," , Amsterdam, The Netherlands, 2007, pp. 202-209.
@inproceedings{yildirim_ontology-supported_2007,
  author    = {Yakup Yildirim and Turgay Yilmaz and Adnan Yazici},
  title     = {Ontology-supported object and event extraction with a genetic algorithms approach for object classification},
  booktitle = {Proceedings of the 6th {ACM} International Conference on Image and Video Retrieval},
  pages     = {202--209},
  year      = {2007},
  publisher = {{ACM}},
  address   = {Amsterdam, The Netherlands},
  isbn      = {978-1-59593-733-9},
  doi       = {10.1145/1282280.1282314},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1282314&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  keywords  = {Extraction d'information, Fuzzy},
  abstract  = {Current solutions are still far from reaching the ultimate goal, namely to enable users to retrieve the desired video clip among massive amounts of visual data in a semantically meaningful manner. With this study we propose a video database model {(OVDAM)} that provides automatic object, event and concept extraction. By using training sets and expert opinions, low-level feature values for objects and relations between objects are determined. {N-Cut} image segmentation algorithm is used to determine segments in video keyframes and the genetic algorithm-based classifier is used to make classification of segments (candidate objects) to objects. At the top level ontology of objects, events and concepts are used. Objects and/or events use all these information to generate events and concepts. The system has a reliable video data model, which gives the user the ability to make ontology-supported fuzzy querying. {RDF} is used to represent metadata. {OWL} is used to represent ontology and {RDQL} is used for querying. Queries containing objects, events, spatio-temporal clauses, concepts and low-level features are handled.},
  annote    = {{{\textless}p{\textgreater}yildirimYakup2007.pdf{\textless}/p{\textgreater}}},
}
-
J. F. Hair Jr., "Knowledge creation in marketing: the role of predictive analytics," European Business Review, vol. 19, iss. 4, pp. 303-315, 2007.
@article{joe_f._hair_jr_knowledge_2007,
  author   = {Hair, Jr., Joe F.},
  title    = {Knowledge creation in marketing: the role of predictive analytics},
  journal  = {European Business Review},
  volume   = {19},
  number   = {4},
  pages    = {303--315},
  year     = {2007},
  issn     = {{0955-534X}},
  url      = {http://www.emeraldinsight.com/10.1108/09555340710760134},
  keywords = {Approche statistique, Extraction d'information},
  abstract = {Purpose – The purpose of this paper is to provide an overview of predictive analytics, summarize how it is impacting knowledge creation in marketing, and suggest future developments in marketing and predictive analytics for both organizations and researchers. Design/methodology/approach – Survival in a knowledge-based economy is derived from the ability to convert information to knowledge. To do so, researchers and managers increasingly are relying on the field of predictive analytics. Data mining identifies and confirms relationships between explanatory and criterion variables. Predictive analytics uses confirmed relationships between variables to predict future outcomes. The predictions are most often values suggesting the likelihood a particular behavior or event will take place in the future. Findings – Data mining and predictive analytics are increasingly popular because of the substantial contributions they can make in converting information to knowledge. Marketing is among the most frequent applications of the techniques, and whether you think about product development, advertising, distribution and retailing, or marketing research and business intelligence, data mining and predictive analytics increasingly are being applied.
Originality/value – In the future, we can expect predictive analytics to increasingly be applied to databases in all fields and revolutionize the ability to identify, understand and predict future developments, data analysts will increasingly rely on mixed-data models that examine both structured (numbers)and unstructured (text and images) data, statistical tools will be more powerful and easier to use, future applications will be global and real time, demand for data analysts will increase as will the need for students to learn data analysis methods, and scholarly researchers will need to improve their quantitative skills so the large amounts of information available can be used to create knowledge instead of information overload.},
}
-
M. Banko, M. J. Cafarella, S. Soderland, M. Broadhead, and O. Etzioni, "Open information extraction from the web," Procs. of IJCAI, 2007.
@inproceedings{banko_open_2007,
  author    = {Banko, M. and Cafarella, M. J. and Soderland, S. and Broadhead, M. and Etzioni, O.},
  title     = {Open information extraction from the web},
  booktitle = {Proceedings of {IJCAI}},
  year      = {2007},
  keywords  = {Extraction d'information, Web},
}
-
S. Roberson and D. Dicheva, "Semi-automatic ontology extraction to create draft topic maps," , Winston-Salem, North Carolina, 2007, pp. 100-105.
@inproceedings{roberson_semi-automatic_2007,
  author    = {Steven Roberson and Darina Dicheva},
  title     = {Semi-automatic ontology extraction to create draft topic maps},
  booktitle = {Proceedings of the 45th Annual Southeast Regional Conference},
  pages     = {100--105},
  year      = {2007},
  address   = {{Winston-Salem,} North Carolina},
  doi       = {10.1145/1233341.1233360},
  url       = {http://doi.acm.org/10.1145/1233341.1233360},
  keywords  = {Extraction d'information, Ontologie},
  abstract  = {Topic maps are a Semantic Web technology that provides a human-oriented mechanism to encode knowledge by organizing web information around topics. Studies have shown, however, that authors face major difficulties in constructing topic maps. This paper discusses an approach to automatic construction of a "draft" topic map for the authors to start with. The idea is to extract topic map constructs by crawling a website and parsing its pages. We propose a set of heuristics that can be used for extracting semantic information from the {HTML} markup of the web pages. We have used this approach to design and implement a plug-in for the topic map editor {TM4L} that automatically extracts topics and relationships from a website specified by the author. An evaluation of the proposed approach in terms of Recall and Precision of the extracted data is presented.},
}
-
H. Kirsch, S. Gaudan, and D. Rebholz-Schuhmann, "Distributed modules for text annotation and IE applied to the biomedical domain," International Journal of Medical Informatics, vol. 75, iss. 6, pp. 496-500, 2006.
@article{kirsch_distributed_2006,
  author   = {Harald Kirsch and Sylvain Gaudan and Dietrich {Rebholz-Schuhmann}},
  title    = {Distributed modules for text annotation and {IE} applied to the biomedical domain},
  journal  = {International Journal of Medical Informatics},
  volume   = {75},
  number   = {6},
  pages    = {496--500},
  year     = {2006},
  url      = {http://www.sciencedirect.com/science/article/B6T7S-4GTW002-3/2/4576e464817da713c80074f0ae0ef342},
  keywords = {Extraction d'information},
  abstract = {Summary Biological databases contain facts from scientific literature that have been curated by hand to ensure high quality. Curation is time-consuming and can be supported by information extraction methods. We present a server software infrastructure which allows to easily plug in modules to identify biologically interesting pieces of text to be then presented in a web interface to the curator. There are modules which identify {UniProt,} {UMLS} and {GO} terminology, gene and protein names, mutations and protein-protein interactions. {UniProt,} {UMLS} and {GO} concepts are automatically linked to the original source. The module for mutations is based on syntax patterns and the one for protein-protein interactions relies on chunk parsing. All modules work as separate servers possibly distributed on different machines and can be combined into processing pipelines as necessary. Communication is based on {XML} annotated text streams, each server processing the {XML} elements it is designed for, and possibly adding more information in the form of {XML} annotation. The server and the underlying software are available to the public.},
}
-
R. K. Srihari, W. Li, T. Cornell, and C. Niu, "InfoXtract : a customizable intermediate level information extraction engine," Natural Language Engineering, vol. 14, iss. 1, pp. 33-69, 2006.
@article{srihari_infoxtract_2006,
  author     = {Srihari, R. K. and Li, Wei and Cornell, T. and Niu, C.},
  title      = {{InfoXtract} : a customizable intermediate level information extraction engine},
  shorttitle = {{InfoXtract}},
  journal    = {Natural Language Engineering},
  volume     = {14},
  number     = {1},
  pages      = {33--69},
  year       = {2006},
  keywords   = {Extraction d'information},
}
-
C. Welty and J. W. Murdock, "Towards knowledge acquisition from information extraction," Proc. 5th International Semantic Web Conference, 2006.
@inproceedings{welty_towards_2006,
  author    = {Welty, C. and Murdock, J. W.},
  title     = {Towards knowledge acquisition from information extraction},
  booktitle = {Proceedings of the 5th International Semantic Web Conference},
  year      = {2006},
  keywords  = {Extraction d'information},
}
-
S.-W. Jung and H.-C. Kwon, "A scalable hybrid approach for extracting head components from Web tables," IEEE Transactions on Knowledge and Data Engineering, vol. 18, iss. 2, pp. 174-187, 2006.
@article{sung-won_jung_scalable_2006,
  author = {Jung, Sung-Won and Kwon, Hyuk-Chul},
  title = {A scalable hybrid approach for extracting head components from {Web} tables},
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume = {18},
  number = {2},
  pages = {174--187},
  year = {2006},
  issn = {1041-4347},
  doi = {10.1109/TKDE.2006.130},
  url = {http://ieeexplore.ieee.org/Xplore/login.jsp?url=/iel5/69/34468/01644735.pdf?tp=&isnumber=34468&arnumber=1644735&punumber=%3Cb%3E%3Cfont%20color=990000%3E69%3C/font%3E%3C/b%3E},
  abstract = {We have established a preprocessing method for determining the meaningfulness of a table to allow for information extraction from tables on the Internet. A table offers a preeminent clue in text mining because it contains meaningful data displayed in rows and columns. However, tables are used on the Internet for both knowledge structuring and document design. Therefore, we were interested in determining whether or not a table has meaningfulness that is related to the structural information provided at the abstraction level of the table head. Accordingly, we: 1) investigated the types of tables present in {HTML} documents, 2) established the features that distinguished meaningful tables from others, 3) constructed a training data set using the established features after having filtered any obvious decorative tables, and 4) constructed a classification model using a decision tree. Based on these features, we set up heuristics for table head extraction from meaningful tables, and obtained an F-measure of 95.0 percent in distinguishing meaningful tables from decorative tables and an accuracy of 82.1 percent in extracting the table head from the meaningful tables.},
  keywords = {Classification, Extraction d'information},
  annote = {jungSung-won2006.pdf}
},
-
J. Vivaldi and H. Rodriguez, "Some notes about the evaluation of terms and term extraction systems," in 5th International Conference on Language Resources and Evaluation, 2006.
@inproceedings{vivaldi_notes_2006,
  author = {Jorge Vivaldi and Horacio Rodriguez},
  title = {Some notes about the evaluation of terms and term extraction systems},
  booktitle = {5th International Conference on Language Resources and Evaluation},
  year = {2006},
  keywords = {Extraction d'information, Fouille de texte}
},
-
R. Ghani, K. Probst, Y. Liu, M. Krema, and A. Fano, "Text mining for product attribute extraction," ACM SIGKDD Explorations Newsletter, vol. 8, iss. 1, pp. 41-48, 2006.
@article{ghani_text_2006,
  author = {Ghani, Rayid and Probst, Katharina and Liu, Yan and Krema, Marko and Fano, Andrew},
  title = {Text mining for product attribute extraction},
  journal = {{ACM} {SIGKDD} Explorations Newsletter},
  volume = {8},
  number = {1},
  pages = {41--48},
  year = {2006},
  doi = {10.1145/1147234.1147241},
  url = {http://portal.acm.org/ft_gateway.cfm?id=1147241&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  abstract = {We describe our work on extracting attribute and value pairs from textual product descriptions. The goal is to augment databases of products by representing each product as a set of attribute-value pairs. Such a representation is beneficial for tasks where treating the product as a set of attribute-value pairs is more useful than as an atomic entity. Examples of such applications include demand forecasting, assortment optimization, product recommendations, and assortment comparison across retailers and manufacturers. We deal with both implicit and explicit attributes and formulate both kinds of extractions as classification problems. Using single-view and multi-view semi-supervised learning algorithms, we are able to exploit large amounts of unlabeled data present in this domain while reducing the need for initial labeled data that is expensive to obtain. We present promising results on apparel and sporting goods products and show that our system can accurately extract attribute-value pairs from product descriptions. We describe a variety of application that are built on top of the results obtained by the attribute extraction system.},
  keywords = {Extraction d'information, Fouille de texte},
  annote = {ghaniRayid2006.pdf}
},
-
C. H. Chang, M. Kayed, M. R. Girgis, and K. F. Shaalan, "A survey of Web information extraction systems," IEEE transaction on knowledge and data engineering, pp. 1411-1428, 2006.
@article{chang_survey_2006,
  author = {Chang, C. H. and Kayed, M. and Girgis, M. R. and Shaalan, K. F.},
  title = {A survey of {Web} information extraction systems},
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume = {18},
  number = {10},
  pages = {1411--1428},
  year = {2006},
  keywords = {Extraction d'information}
},
-
S. Hélie, R. Proulx, and B. Lefebvre, "JPEX: A psychologically plausible joint probability extractor," in Proceedings of the 28th Annual Conference of the Cognitive Science Society, R. Sun and N. Miyake, Eds., Vancouver, B.C., 2006, pp. 1482-1487.
@inproceedings{hlie_jpex:psychologically_2006,
  author = {Hélie, Sébastien and Proulx, Robert and Lefebvre, Bernard},
  editor = {Sun, R. and Miyake, N.},
  title = {{JPEX}: A psychologically plausible joint probability extractor},
  booktitle = {Proceedings of the 28th Annual Conference of the Cognitive Science Society},
  address = {Vancouver, {B.C.}},
  pages = {1482--1487},
  year = {2006},
  abstract = {Extracting redundancies in the data is the main purpose of unsupervised learning and estimating the covariance using Hebbian learning is a widespread way to achieve this. However, Hebbian learning only leads to the extraction of between-unit covariance. Because most associative memories use distributed representations, it would be more useful to extract the covariance of states. Yet, this operation would still be insufficient to fully model more complex environments, which include higher-order relations. In the present paper, we propose a new architecture, {JPEX,} which extracts higher-order joint probabilities at the state level using the tensor product as a learning rule. This new learning rule is compared with simple Hebbian learning in an environment which includes second-order relations. Also, {JPEX}'s ability to learn non-linear relationships is illustrated by training the model on the {XOR} categorization problem.},
  keywords = {Extraction d'information}
},
-
G. Geleijnse and J. Korst, "Learning effective surface text patterns for information extraction," in Proceedings of the EACL 2006 workshop on Adaptive Text Extraction and Mining (ATEM 2006), pp. 1-8, 2006.
@inproceedings{geleijnse_learning_2006,
  author = {Geleijnse, G. and Korst, J.},
  title = {Learning effective surface text patterns for information extraction},
  booktitle = {Proceedings of the {EACL} 2006 Workshop on Adaptive Text Extraction and Mining ({ATEM} 2006)},
  pages = {1--8},
  year = {2006},
  keywords = {Extraction d'information}
},
-
P. C. Barman, N. Iqbal, and S. Y. Lee, "Non-negative matrix factorization based text mining: Feature extraction and classification," Neural information processing, Pt 2, proceedings, vol. 4233, pp. 703-712, 2006.
@inproceedings{barman_non-negative_2006,
  author = {Barman, P. C. and Iqbal, N. and Lee, S. Y.},
  title = {Non-negative matrix factorization based text mining: Feature extraction and classification},
  shorttitle = {Non-negative matrix factorization based text mining},
  booktitle = {Neural Information Processing, Part {II}, Proceedings},
  series = {Lecture Notes in Computer Science},
  volume = {4233},
  pages = {703--712},
  year = {2006},
  issn = {0302-9743},
  url = {http://apps.isiknowledge.com/full_record.do?product=WOS&search_mode=CitationReport&qid=4&SID=1EOnkFI4kknf@49oNkC&page=1&doc=41},
  abstract = {The unlabeled document or text collections are becoming larger and larger which is common and obvious; mining such data sets are a challenging task. Using the simple word-document frequency matrix as feature space the mining process is becoming more complex. The text documents are often represented as high dimensional about few thousand sparse vectors with sparsity about 95 to 99\% which significantly affects the efficiency and the results of the mining process. In this paper, we propose the two-stage Non-negative Matrix Factorization {(NMF):} in the first stage we tried to extract the uncorrelated basis probabilistic document feature vectors by significantly reducing the dimension of the feature vectors of the word-document frequency from few thousand to few hundred, and in the second stage for clustering or classification. In our propose approach it has been observed that the clustering or classification performance with more than 98.5\% accuracy. The dimension reduction and classification performance has observed for the Classic3 dataset.},
  keywords = {Classification, Extraction d'information, Fouille de texte}
},
-
L. J. Jensen, J. Saric, and P. Bork, "Literature mining for the biologist: from information retrieval to biological discovery," Nature Reviews Genetics, vol. 7, iss. 2, pp. 119-129, 2006.
@article{jensen_literature_2006,
  author = {Jensen, L. J. and Saric, J. and Bork, P.},
  title = {Literature mining for the biologist: from information retrieval to biological discovery},
  shorttitle = {Literature mining for the biologist},
  journal = {Nature Reviews Genetics},
  volume = {7},
  number = {2},
  pages = {119--129},
  year = {2006},
  keywords = {Extraction d'information, Fouille de texte}
},
-
P. Cimiano, F. Ciravegna, J. Domingue, S. Handschuh, A. Lavelli, S. Staab, and M. Stevenson, "Requirements for information extraction for knowledge management," 2006.
@misc{cimiano_requirements_2006,
  author = {Cimiano, Philipp and Ciravegna, Fabio and Domingue, John and Handschuh, Siegfried and Lavelli, Alberto and Staab, S. and Stevenson, Mark},
  title = {Requirements for information extraction for knowledge management},
  year = {2006},
  url = {http://sunsite.informatik.rwth-aachen.de/Publications/CEUR-WS/Vol-101/Philipp_Cimiano-et-al.pdf},
  abstract = {Knowledge Management {(KM)} systems inherently suffer from the knowledge acquisition bottleneck - the difficulty of modeling and formalizing knowledge relevant for specific domains. A potential solution to this problem is Information Extraction {(IE)} technology. However, {IE} was originally developed for database population and there is a mismatch between what is required to successfully perform {KM} and what current {IE} technology provides. In this paper we begin to address this issue by outlining requirements for {IE} based {KM}.},
  keywords = {Extraction d'information, Gestion des connaissances, Ontologie},
  annote = {cimianoPhilipp2006.pdf}
},
-
A. H. Doan, R. Ramakrishnan, and S. Vaithyanathan, "Managing information extraction : state of the art and research directions," Proceedings of the 2006 ACM SIGMOD international conference on Management of data, pp. 799-800, 2006.
@inproceedings{doan_managing_2006,
  author = {Doan, A. H. and Ramakrishnan, R. and Vaithyanathan, S.},
  title = {Managing information extraction: state of the art and research directions},
  shorttitle = {Managing information extraction},
  booktitle = {Proceedings of the 2006 {ACM} {SIGMOD} International Conference on Management of Data},
  pages = {799--800},
  year = {2006},
  keywords = {Extraction d'information}
},
-
P. Buitelaar, P. Cimiano, S. Racioppa, and M. Siegel, "Ontology-based information extraction with SOBA," Proc. of the International Conference on Language Resources and Evaluation (LREC), 2006.
@inproceedings{buitelaar_ontology-based_2006,
  author = {Buitelaar, P. and Cimiano, P. and Racioppa, S. and Siegel, M.},
  title = {Ontology-based information extraction with {SOBA}},
  booktitle = {Proceedings of the International Conference on Language Resources and Evaluation ({LREC})},
  year = {2006},
  keywords = {Extraction d'information, Ontologie}
},
-
P. Jackson and F. Schilder, "Natural language processing: overview," in Encyclopedia of Language & Linguistics, K. Brown, Ed., Oxford: Elsevier, 2006, pp. 503-518.
@incollection{jackson_natural_2006,
  author = {Jackson, P. and Schilder, F.},
  editor = {Brown, Keith},
  title = {Natural language processing: overview},
  booktitle = {Encyclopedia of Language \& Linguistics},
  edition = {Second},
  publisher = {Elsevier},
  address = {Oxford},
  pages = {503--518},
  year = {2006},
  url = {http://www.sciencedirect.com/science/article/B7T84-4M3C3K0-7X/2/06667485f922693c691e49113551bbe0},
  abstract = {The advent of the World Wide Web has greatly increased demand for software tools and appliances for processing unstructured and semi-structured natural language text. Ancillary developments, such as corporate intranets, enterprise portals, and ubiquitous e-mail, have created many challenges and opportunities in application areas such as information retrieval, electronic commerce, and knowledge management. On the supply side, the development of language technology to address such attendant problems as information overload and rapid globalization has been facilitated by two technical breakthroughs. The first is conceptual, and represents a new emphasis upon empirical approaches to language processing that rely more heavily upon corpus statistics than linguistic theory. The second is computational, and consists of more powerful, networked machines that are capable of processing millions of documents and performing the billions of calculations that the statistical profiling of large corpora requires. This article outlines the new application areas and describes some of the advances that have been made. The emphasis is upon showing how the technical approaches outlined elsewhere in this encyclopedia can be combined to create products and services that have genuine value.},
  keywords = {Catégorisation, Extraction d'information, Fouille de texte, Intelligence artificielle, Langage naturel}
},
-
M. Huang, X. Zhu, and M. Li, "A hybrid method for relation extraction from biomedical literature," International Journal of Medical Informatics, vol. 75, iss. 6, pp. 443-455, 2006.
@article{huang_hybrid_2006,
  author = {Huang, Minlie and Zhu, Xiaoyan and Li, Ming},
  title = {A hybrid method for relation extraction from biomedical literature},
  journal = {International Journal of Medical Informatics},
  volume = {75},
  number = {6},
  pages = {443--455},
  year = {2006},
  url = {http://www.sciencedirect.com/science/article/B6T7S-4GV98B6-3/2/cb8ee2049c9d78d290685d09a71a1ab0},
  abstract = {Over recent years, there has been a growing interest in extracting entities and relations from biomedical literature. There are a vast number of systems and approaches being proposed to extract biological relations, but none of them achieves satisfactory results. These methodologies are either parsing-based or pattern-based, which are not competent to handle the grammatical complexities of biomedical texts, or too complicated to be adapted. It is well known that appositive, coordinative propositions and such grammatical structures are extremely common in biomedical texts, particularly in full texts. However, these problems are still untouched for most of researchers. Methods: In this paper, we have proposed a new approach, which is hybrid with both shallow parsing and pattern matching, to extract relations between proteins from scientific papers of biomedical themes. In the method, appositive and coordinative structures are interpreted based on the shallow parsing analysis, with both syntactic and semantic constraints. Then long sentences are splitted into sub-ones, from which relations are extracted by a greedy pattern matching algorithm, along with automatically generated patterns. Results: Our approach is experimented to extract protein-protein interactions from full biomedical texts, and has achieved an average F-score of 80\% on individual verbs, and 66\% on all verbs. With the help of shallow parsing analysis, pattern matching is improved remarkably. Compared with the traditional pattern matching algorithm, our approach achieves about 7\% improvement of both precision and F-score. In contrast to other systems, our approach achieves performance comparable to the best. A demo system has been available at http://spies.cs.tsinghua.edu.cn.},
  keywords = {Extraction d'information}
},
-
J. J. García Adeva and R. Calvo, "Mining Text with Pimiento," IEEE Internet Computing, vol. 10, iss. 4, pp. 27-35, 2006.
@article{adeva_mining_2006,
  author = {{García Adeva}, J. J. and Calvo, R.},
  title = {Mining Text with {Pimiento}},
  journal = {{IEEE} Internet Computing},
  volume = {10},
  number = {4},
  pages = {27--35},
  year = {2006},
  url = {http://portal.acm.org/citation.cfm?id=1158822.1159024},
  abstract = {Information systems are using an increasing amount of unstructured information in the form of text. This situation has spawned a need to improve the text-mining technologies needed for information retrieval, filtering, and classification. This article compares some of the options available and how they can provide textual data-mining functionalities to software applications. In particular, the authors focus on Pimiento, a new object-oriented application framework for text mining. This framework allows developers to easily create distributed applications that use machine learning and statistical techniques to automatically process documents.},
  keywords = {Cluster, Extraction d'information, Fouille de texte},
  annote = {adevaJuanJose2006.pdf}
},
-
T. S. Jayram, R. Krishnamurthy, S. Raghavan, S. Vaithyanathan, and H. Zhu, "Avatar information extraction system," IEEE Data Engineering Bulletin, vol. 29, iss. 1, pp. 40-48, 2006.
@article{jayram_avatar_2006,
  author = {T. S. Jayram and R. Krishnamurthy and S. Raghavan and S. Vaithyanathan and H. Zhu},
  title = {Avatar information extraction system},
  journal = {{IEEE} Data Engineering Bulletin},
  volume = {29},
  number = {1},
  pages = {40--48},
  year = {2006},
  keywords = {Extraction d'information}
},
-
E. Riloff, J. Wiebe, and W. Phillips, "Exploiting subjectivity classification to improve information extraction," Proceedings of AAAI, 2005.
@inproceedings{riloff_exploiting_2005,
  author = {Riloff, E. and Wiebe, J. and Phillips, W.},
  title = {Exploiting subjectivity classification to improve information extraction},
  booktitle = {Proceedings of {AAAI}},
  year = {2005},
  url = {http://www2.cs.pitt.edu/~wiebe/pubs/papers/aaai05.pdf},
  keywords = {Classification, Extraction d'information},
  annote = {riloffEllen2005.pdf}
},
-
R. J. Mooney and R. Bunescu, "Mining knowledge from text using information extraction," ACM SIGKDD Explorations Newsletter, vol. 7, iss. 1, pp. 3-10, 2005.
@article{mooney_mining_2005,
  author = {Mooney, Raymond J. and Bunescu, Razvan},
  title = {Mining knowledge from text using information extraction},
  journal = {{ACM} {SIGKDD} Explorations Newsletter},
  volume = {7},
  number = {1},
  pages = {3--10},
  year = {2005},
  issn = {1931-0145},
  doi = {10.1145/1089815.1089817},
  url = {http://proquest.umi.com/pqdweb?sid=1&vinst=PROD&fmt=6&startpage=-1&clientid=48948&vname=PQD&RTQ=309&did=726085741&caling=FULL&vtype=PQD&rqt=309&cfc=1&TS=1214944260&clientld=48948},
  abstract = {An important approach to text mining involves the use of natural-language information extraction. Information extraction {(IE)} distills structured data or knowledge from unstructured text by identifying references to named entities as well as stated relationships between such entities. {IE} systems can be used to directly extricate abstract knowledge from a text corpus, or to extract concrete data from a set of documents which can then be further analyzed with traditional data-mining techniques to discover more general patterns. We discuss methods and implemented systems for both of these approaches and summarize results on mining real text corpora of biomedical abstracts, job announcements, and product descriptions. We also discuss challenges that arise when employing current information extraction technology to discover knowledge in text.},
  keywords = {Extraction d'information, Fouille de texte},
  annote = {mooneyRaymond2005.pdf}
},
-
C. C. Huang, S. L. Chuang, and L. F. Chien, "Categorizing unknown text segments for information extraction using a search result mining approach," Natural Language Processing – IJCNLP 2004, vol. 3248, pp. 576-586, 2005.
@inproceedings{huang_categorizing_2005,
  author = {Huang, C. C. and Chuang, S. L. and Chien, L. F.},
  title = {Categorizing unknown text segments for information extraction using a search result mining approach},
  booktitle = {Natural Language Processing -- {IJCNLP} 2004},
  series = {Lecture Notes in Computer Science},
  volume = {3248},
  pages = {576--586},
  year = {2005},
  issn = {0302-9743},
  abstract = {An advanced information extraction system requires an effective text categorization technique to categorize extracted facts (text segments) into a hierarchy of domain-specific topic categories. Text segments are often short and their categorization is quite different from conventional document categorization. This paper proposes a Web mining approach that exploits Web resources to categorize unknown text segments with limited manual intervention. The feasibility and wide adaptability of the proposed approach has been shown with extensive experiments on categorizing different kinds of text segments including domain-specific terms, named entities, and even paper titles into Yahoo!'s taxonomy trees.},
  keywords = {Catégorisation, Extraction d'information}
},
-
C. Lemay, M.-C. L'Homme, and P. Drouin, "Two methods for extracting 'specific' single-word terms from specialized corpora: experimentation and evaluation," International Journal of Corpus Linguistics, vol. 10, iss. 2, pp. 227-255, 2005.
@article{lemay_two_2005,
  author = {Lemay, Chantal and L'Homme, Marie-Claude and Drouin, Patrick},
  title = {Two methods for extracting ``specific'' single-word terms from specialized corpora: experimentation and evaluation},
  journal = {International Journal of Corpus Linguistics},
  volume = {10},
  number = {2},
  pages = {227--255},
  year = {2005},
  url = {http://portal.acm.org/ft_gateway.cfm?id=1298454&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  abstract = {Recently, corpus comparison has been used by a number of researchers for extracting single-word terms {(SWTs)} from specialized corpora. It is viewed as a means to supplement multi-word term {(MWT)} extraction, the focus of which is on noun phrases. However, little is known about the value of this technique in a terminological setting. This paper examines two different methods for finding French {SWTs} in the field of computing. The first one {(M1)} compares the specialized corpus to a corpus considered to be a reflection of language as a whole. The second one {(M2)} breaks down the specialized corpus into six topical subcorpora that are compared in turn to the entire specialized corpus. The calculation relies on standard normal distribution and is carried out by a program called {TermoStat}. The specific units produced by both methods are then evaluated by comparing them to the contents of two specialized dictionaries. We also compare the results yielded by the two methods. Results show that precision is fair (approximately 50\% of units extracted by both methods can be found in specialized dictionaries). However, recall is lower in both methods. Results also reveal that, even though M1 yields better results that M2, both methods are useful for identifying {SWTs} and should be considered in terminological work.},
  keywords = {Extraction d'information, Méthodologie}
},
-
A. Vailaya, P. Bluvas, R. Kincaid, A. Kuchinsky, M. Creech, and A. Adler, "An architecture for biological information extraction and representation," Bioinformatics, vol. 21, 2005.
@article{vailaya_architecture_2005,
  author = {Vailaya, A. and Bluvas, P. and Kincaid, R. and Kuchinsky, A. and Creech, M. and Adler, A.},
  title = {An architecture for biological information extraction and representation},
  journal = {Bioinformatics},
  volume = {21},
  publisher = {Oxford Univ Press},
  year = {2005},
  keywords = {Extraction d'information}
},
-
W. Aroonmanakun, "Collocation extract : a tool for extracting collocation," Journal of English Studies, iss. 2, pp. 28-39, 2005.
@article{aroonmanakun_collocation_2005,
  author = {Aroonmanakun, Wirote},
  title = {Collocation extract: a tool for extracting collocation},
  journal = {Journal of English Studies},
  number = {2},
  pages = {28--39},
  year = {2005},
  url = {http://pioneer.chula.ac.th/~awirote/ling/TU-ELT2005.pdf},
  keywords = {Extraction d'information},
  annote = {aroonmanakunWrote2005.pdf}
},
-
J. R. Finkel, T. Grenager, and C. Manning, "Incorporating non-local information into information extraction systems by Gibbs sampling," Proceedings of the 43rd Annual Meeting on Association for Computational Linguistics, pp. 363-370, 2005.
@inproceedings{finkel_incorporating_2005,
  author = {Finkel, J. R. and Grenager, T. and Manning, C.},
  title = {Incorporating non-local information into information extraction systems by {Gibbs} sampling},
  booktitle = {Proceedings of the 43rd Annual Meeting on Association for Computational Linguistics},
  pages = {363--370},
  year = {2005},
  keywords = {Extraction d'information}
},
-
S. M. Meystre, "Medical problem list automation using natural language processing," Dissertation Abstracts International, Section B: The Sciences and Engineering, vol. 66, iss. 7, p. 3596, 2005.
@article{meystre_medical_2005,
  author = {Meystre, Stéphane M.},
  title = {Medical problem list automation using natural language processing},
  journal = {Dissertation Abstracts International, Section B: The Sciences and Engineering},
  volume = {66},
  number = {7},
  pages = {3596},
  year = {2005},
  url = {http://gateway.proquest.com/openurl?url_ver=Z39.88-2004&res_dat=xri:pqdiss&rft_val_fmt=info:ofi/fmt:kev:mtx:dissertation&rft_dat=xri:pqdiss:3181466},
  abstract = {The electronic problem-oriented medical record was conceived to alleviate limitations of the paper-based medical record, and to improve its organization. The list of medical problems is at the heart of this problem-oriented record, and requires completeness, accuracy and timeliness to fulfill this central role. At Intermountain Health Care {(IHC),} a problem-oriented electronic medical record is being developed, and features a medical problem list at its core. This list is already in use in the outpatient setting, but is often incomplete, inaccurate and out-of-date. This issue is even more prominent for hospitalized patients. To help maintain a complete, accurate and timely problem list, I developed an Automated Problem List system using Natural Language Processing {(NLP)} to extract potential medical problems from the patient's electronic clinical documents. These problems are proposed to the user for inclusion in the "official" problem list, along with a link to allow viewing the documents the problem was extracted from. Two main applications compose this system. A background application does all documents processing and analysis using {NLP,} and the problem list management application allows viewing and editing these proposed problems. In the development of this system, the {NLP} module of the background application was evaluated first. This laboratory function study showed good recall and satisfying precision; accuracy was further improved by enhancing disambiguation and negation detection. A second study prospectively evaluated the whole Automated Problem List system in a clinical setting at the {LDS} Hospital. Patients benefiting from this system had more complete and timely problem lists. The sensitivity was higher, and the time between a medical problem's first mention in a clinical document and its addition to the list of problems was significantly reduced.
In summary, this dissertation describes the planning, development, implementation and evaluation of a system using {NLP} to automatically extract medical problems from electronic clinical documents. This Automated Problem List system allowed better quality content of the problem list, opening doors to larger scale use of this system and contributing to possible answers to the challenge of making the problem list a cornerstone of our evolving clinical information system.},
  keywords = {Extraction d'information, Langage naturel, Linguistique},
  annote = {{MLA} International Bibliography}
},
-
H. Cunningham, "Information extraction, automatic," Second European Semantic Web Conference (ESWC), vol. 9, pp. 64-84, 2005.
@inproceedings{cunningham_information_2005,
  author = {Cunningham, H.},
  title = {Information extraction, automatic},
  booktitle = {Second European Semantic Web Conference ({ESWC})},
  volume = {9},
  pages = {64--84},
  year = {2005},
  keywords = {Extraction d'information}
},
-
P. Cimiano, U. Reyle, and J. Saric, "Ontology-driven discourse analysis for information extraction," Data \& Knowledge Engineering, vol. 55, iss. 1, pp. 59-83, 2005.
@article{cimiano_ontology-driven_2005,
  author = {P. Cimiano and U. Reyle and J. Saric},
  title = {Ontology-driven discourse analysis for information extraction},
  journal = {Data \& Knowledge Engineering},
  volume = {55},
  number = {1},
  pages = {59--83},
  year = {2005},
  issn = {{0169-023X}},
  abstract = {This paper presents a novel approach to discourse analysis within information extraction systems. It makes use of {DRT} as formal representation of the linguistic context as well as of a domain-specific ontology as a basis to compute conceptual relations between extracted events thus establishing discourse coherence. The approach has been implemented within {GenIE,} an information extraction system with the aim of extracting information about biochemical pathways, about sequences, structures and functions of genomes and proteins. The approach is evaluated against a semantically hand-annotated set of {Swiss-Prot} protein function descriptions and shows very promising results. (c) 2004 Elsevier {B.V.} All rights reserved.},
  keywords = {Analyse de discours, Extraction d'information, Langage naturel, Ontologie}
},
-
A. McCallum, "Information extraction: distilling structured data from unstructured text," Queue, vol. 3, iss. 9, pp. 48-57, 2005.
@article{mccallum_information_2005,
  author = {A. {McCallum}},
  title = {Information extraction: distilling structured data from unstructured text},
  shorttitle = {Information extraction},
  journal = {Queue},
  volume = {3},
  number = {9},
  pages = {48--57},
  year = {2005},
  keywords = {Extraction d'information}
},
-
J. Natarajan, N. Mulay, C. DeSesa, C. J. Hack, W. Dubitzky, and E. G. Bremer, "A grid infrastructure for text mining of full text articles and creation of a knowledge base of gene relations," Biological and medical data analysis, proceedings, vol. 3745, pp. 101-108, 2005.
@inproceedings{natarajan_grid_2005,
  author = {Natarajan, J. and Mulay, N. and {DeSesa}, C. and Hack, C. J. and Dubitzky, W. and Bremer, E. G.},
  title = {A grid infrastructure for text mining of full text articles and creation of a knowledge base of gene relations},
  booktitle = {Biological and Medical Data Analysis, Proceedings},
  series = {Lecture Notes in Computer Science},
  volume = {3745},
  pages = {101--108},
  year = {2005},
  issn = {0302-9743},
  abstract = {We demonstrate the application of a grid infrastructure for conducting text mining over distributed data and computational resources. The approach is based on using {LexiQuest} Mine, a text mining workbench, in a grid computing environment. We describe our architecture and approach and provide an illustrative example of mining full-text journal articles to create a knowledge base of gene relations. The number of patterns found increased from 0.74 per full-text articles from a corpus of 1000 articles to 0.83 when the corpus contained 5000 articles. However, it was also shown that mining a corpus of 5000 full-text articles took 26 hours on a single computer, whilst the process was completed in less than 2.5 hours on a grid comprising of 20 computers. Thus whilst increasing the size of the corpus improved the efficiency of the text-mining process, a grid infrastructure was required to complete the task in a timely manner.},
  keywords = {Extraction d'information},
  annote = {url non fonctionnel enlevé (Mylène)}
},
-
G. Angelova, "Language technologies meet ontology acquisition," in Conceptual structures : common semantics for sharing knowledge : 13th international conference on conceptual structures, ICCS 2005, Kassel, Germany, july 17-22, 2005 : proceedings, Berlin; Heidelberg, 2005, pp. 367-380.
@inproceedings{angelova_language_2005,
  author    = {Angelova, Galia},
  title     = {Language technologies meet ontology acquisition},
  booktitle = {Conceptual structures : common semantics for sharing knowledge : 13th international conference on conceptual structures, {ICCS} 2005, Kassel, Germany, July 17-22, 2005 : proceedings},
  series    = {Lecture Notes in Computer Science},
  subseries = {Lecture Notes in Artificial Intelligence},
  volume    = {3596},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2005},
  pages     = {367--380},
  isbn      = {978-3-540-27783-5},
  keywords  = {Extraction d'information, Langage naturel},
  abstract  = {This paper overviews and analyses the on-going research attempts to apply language technologies to automatic ontology acquisition. At first glance there are many successful approaches in this very hot field. However, most of them aim at the extraction of named entities as well as draft taxonomies and partonomies. Only few attempts exist for enriching ontologies by applying word-sense disambiguation. There are principle obstacles to extract automatically coherent conceptualisations from raw texts: it is impossible to identify exactly the types and their instances as well as the word meanings which denote types. It is also impossible to validate a text-based conceptual model against the real world. Thus we can expect only partial success in the semi-automatic acquisition in specific (limited) domains, by workbenches supporting the human knowledge engineer in the final ontological choices.}
}
-
D. Damle and V. Uren, "Extracting significant words from corpora for ontology extraction," , Banff, Alberta, Canada, 2005, pp. 187-188.
-
G. S. Mann and D. Yarowsky, "Multi-field information extraction and cross-document fusion," Proceedings of the 43rd Annual Meeting on Association for Computational Linguistics, pp. 483-490, 2005.
@inproceedings{mann_multi-field_2005,
  author    = {Mann, G. S. and Yarowsky, D.},
  title     = {Multi-field information extraction and cross-document fusion},
  booktitle = {Proceedings of the 43rd Annual Meeting on Association for Computational Linguistics},
  year      = {2005},
  pages     = {483--490},
  keywords  = {Extraction d'information}
}
-
K. Simon and G. Lausen, "ViPER : augmenting automatic information extraction with visual perceptions," Proceedings of the 14th ACM international conference on Information and knowledge management, pp. 381-388, 2005.
@inproceedings{simon_viper_2005,
  author     = {Simon, K. and Lausen, G.},
  title      = {{ViPER} : augmenting automatic information extraction with visual perceptions},
  shorttitle = {{ViPER}},
  booktitle  = {Proceedings of the 14th {ACM} international conference on Information and knowledge management},
  year       = {2005},
  pages      = {381--388},
  keywords   = {Extraction d'information, Visualisation de l'information}
}
-
J. Azé, M. Roche, and M. Sebag, "Bagging evolutionary ROC-based hypotheses : application to terminology extraction," , Bonn, 2005.
@inproceedings{az_bagging_2005,
  author   = {Azé, Jérôme and Roche, Mathieu and Sebag, Michèle},
  title    = {Bagging evolutionary {ROC-based} hypotheses : application to terminology extraction},
  address  = {Bonn},
  year     = {2005},
  url      = {http://citeseer.ist.psu.edu/731801.html},
  keywords = {Extraction d'information},
  abstract = {The claim of the paper is that Evolutionary Learning is a source of diverse hypotheses “for free”, and this specificity can be used to combine in an ensemble the hypotheses learned in independent runs. The aim of our algorithm named Broger (Bagging-ROC GEnetic LEarneR) consists of optimizing the Area Under the {ROC} Curve using Evolutionary Learning. This paper first presents the theoretical framework of Broger and then its application to a Term Extraction task in Text Mining.},
  annote   = {Algorithm = Broger (Bagging-ROC GEnetic LEarneR). Term extraction. Biomedical. azeJerome2005.pdf}
}
-
S. Sarawagi and W. W. Cohen, "Semi-Markov conditional random fields for information extraction," Advances in Neural Information Processing Systems, vol. 17, pp. 1185-1192, 2005.
@article{sarawagi_semi-markov_2005,
  author   = {Sarawagi, S. and Cohen, W. W.},
  title    = {Semi-{Markov} conditional random fields for information extraction},
  journal  = {Advances in Neural Information Processing Systems},
  volume   = {17},
  pages    = {1185--1192},
  year     = {2005},
  keywords = {Extraction d'information}
}
-
M. Ben-Dov and R. Feldman, "Text mining and information extraction." New York: Springer, 2005, pp. 801-831.
@incollection{ben-dov_text_2005,
  author    = {Moty {Ben-Dov} and Ronen Feldman},
  title     = {Text mining and information extraction},
  booktitle = {Data mining and knowledge discovery handbook},
  publisher = {Springer},
  address   = {New York},
  year      = {2005},
  pages     = {801--831},
  url       = {http://www.springerlink.com/content/v1w5n480157582m8/},
  keywords  = {Classification, Extraction d'information, Fouille de texte},
  abstract  = {Text Mining is the automatic discovery of new, previously unknown information, by automatic analysis of various textual resources. Text mining starts by extracting facts and events from textual sources and then enables forming new hypotheses that are further explored by traditional Data Mining and data analysis methods. In this chapter we will define text mining and describe the three main approaches for performing information extraction. In addition, we will describe how we can visually display and analyze the outcome of the information extraction process.}
}
-
A. Koike, Y. Niwa, and T. Takagi, "Automatic extraction of gene/protein biological functions from biomedical text," Bioinformatics, vol. 21, iss. 7, pp. 1227-1236, 2005.
@article{koike_automatic_2005,
  author   = {A. Koike and Y. Niwa and T. Takagi},
  title    = {Automatic extraction of gene/protein biological functions from biomedical text},
  journal  = {Bioinformatics},
  volume   = {21},
  number   = {7},
  pages    = {1227--1236},
  year     = {2005},
  keywords = {Extraction d'information}
}
-
L. Hirschman, A. Yeh, C. Blaschke, and A. Valencia, "Overview of BioCreAtIvE : critical assessment of information extraction for biology," BMC Bioinformatics, vol. 6, iss. Suppl 1, p. S1, 2005.
@article{hirschman_overview_2005,
  author     = {Hirschman, L. and Yeh, A. and Blaschke, C. and Valencia, A.},
  title      = {Overview of {BioCreAtIvE} : critical assessment of information extraction for biology},
  shorttitle = {Overview of {BioCreAtIvE}},
  journal    = {{BMC} Bioinformatics},
  volume     = {6},
  number     = {Suppl 1},
  pages      = {S1},
  year       = {2005},
  doi        = {10.1186/1471-2105-6-S1-S1},
  keywords   = {Extraction d'information}
}
-
D. Milward, "Ontology-based interactive information extraction from scientific abstracts," Comparative and Functional Genomics, vol. 6, pp. 67-71, 2005.
@article{milward_ontology-based_2005,
  author   = {David Milward},
  title    = {Ontology-based interactive information extraction from scientific abstracts},
  journal  = {Comparative and Functional Genomics},
  volume   = {6},
  pages    = {67--71},
  year     = {2005},
  url      = {http://www.ingentaconnect.com/content/jws/cfg/2005/00000006/F0020001/art00007},
  keywords = {Extraction d'information, Ontologie},
  abstract = {Over recent years, there has been a growing interest in extracting information automatically or semi-automatically from the scientific literature. This paper describes a novel ontology-based interactive information extraction {(OBIIE)} framework and a specific {OBIIE} system. We describe how this system enables life scientists to make ad hoc queries similar to using a standard search engine, but where the results are obtained in a database format similar to a pre-programmed information extraction engine. We present a case study in which the system was evaluated for extracting co-factors from {EMBASE} and {MEDLINE.} Copyright © 2005 John Wiley \& Sons, Ltd.}
}
-
X. Hu and X. Xu, "Mining novel connections from online biomedical text databases using semantic query expansion and semantic-relationship pruning," International Journal of Web and Grid Services, vol. 1, iss. 2, pp. 222-239, 2005.
@article{hu_mining_2005,
  author   = {Xiaohua Hu and Xuheng Xu},
  title    = {Mining novel connections from online biomedical text databases using semantic query expansion and semantic-relationship pruning},
  journal  = {International Journal of Web and Grid Services},
  volume   = {1},
  number   = {2},
  pages    = {222--239},
  year     = {2005},
  url      = {http://www.ingentaconnect.com/content/ind/ijwgs/2005/00000001/00000002/art00005},
  keywords = {Extraction d'information, Recherche d'information},
  abstract = {This paper proposes a semantic-based approach for mining novel connections from biomedical literature. The method takes advantage of the biomedical ontologies, {MeSH} and {UMLS,} as the source of semantic knowledge. A prototype system, Biomedical Semantic-based Knowledge Discovery System {(Bio-SbKDS),} is designed to uncover novel hypotheses/connections hidden in biomedical literature through semantic query expansion and semantic-relationship pruning. {Bio-SbKDS} can automatically generate relevant search terms to retrieve the semantic-relevant articles from the online biomedical text databases. Using the semantic types and semantic relations of the biomedical concepts, {Bio-SbKDS} can identify the relevant concepts collected from Medline and generate the novel hypothesis between these concepts. {Bio-SbKDS} successfully replicates Dr. Swanson's two famous discoveries: Raynaud disease/fish oil and migraine/magnesium. Compared with previous approaches, our methods search much less articles, generate much less but more relevant novel hypotheses and require much less human intervention in the discovery procedure.}
}
-
M. T. Pazienza, M. Pennacchiotti, M. Vindigni, and F. M. Zanzotto, "AI/NLP technologies applied to spacecraft mission design," in Innovations in applied artificial intelligence : 18th international conference on industrial and engineering applications of artificial intelligence and expert systems, IEA/AIE 2005, Bari, Italy, june 22-24, 2005 : proceedings, Heidelberg, 2005, pp. 239-248.
@inproceedings{pazienza_ai/nlp_2005,
  author    = {Pazienza, Maria Teresa and Pennacchiotti, Marco and Vindigni, Michele and Zanzotto, Fabio Massimo},
  title     = {{AI/NLP} technologies applied to spacecraft mission design},
  booktitle = {Innovations in applied artificial intelligence : 18th international conference on industrial and engineering applications of artificial intelligence and expert systems, {IEA/AIE} 2005, Bari, Italy, June 22-24, 2005 : proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {3533},
  publisher = {Springer Verlag},
  address   = {Heidelberg},
  year      = {2005},
  pages     = {239--248},
  keywords  = {Extraction d'information, Indexation},
  abstract  = {In this paper we propose the model of a prototypical {NLP} architecture of an information access system to support a team of experts in a scientific design task, in a shared and heterogeneous framework. Specifically, we believe {AI/NLP} can be helpful in several tasks, such as the extraction of implicit information needs enclosed in meeting minutes or other documents, analysis of explicit information needs expressed through Natural Language, processing and indexing of document collections, extraction of required information from documents, modeling of a common knowledge base, and, finally, identification of important concepts through the automatic extraction of terms. In particular, we envisioned this architecture in the specific and practical scenario of the Concurrent Design Facility {(CDF)} of the European Space Agency {(ESA),} in the framework of the {SHUMI} project {(Support} To {HUman} Machine Interaction) developed in collaboration with the {ESA/ESTEC} - {ACT} {(Advanced} Concept Team). © {Springer-Verlag} Berlin Heidelberg 2005.},
  annote    = {Compilation and indexing terms, Copyright 2007 Elsevier Inc. All rights reserved 06179837202 AI/NLP technologies Spacecraft mission design Concurrent Design Facility (CDF) European Space Agency (ESA)}
}
-
D. Downey, O. Etzioni, and S. Soderland, "A probabilistic model of redundancy in information extraction," in Proceedings of the International Joint Conference on Artificial Intelligence (IJCAI), 2005.
@inproceedings{downey_probabilistic_2005,
  author    = {Downey, D. and Etzioni, O. and Soderland, S.},
  title     = {A probabilistic model of redundancy in information extraction},
  booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence ({IJCAI})},
  year      = {2005},
  keywords  = {Approche probabiliste, Extraction d'information},
  annote    = {downeyDoug2005.pdf}
}
-
N. Ireson, F. Ciravegna, M. E. Califf, D. Freitag, N. Kushmerick, and A. Lavelli, "Evaluating machine learning for information extraction," ACM International Conference Proceeding Series, vol. 119, pp. 345-352, 2005.
@article{ireson_evaluating_2005,
  author   = {N. Ireson and F. Ciravegna and M. E. Califf and D. Freitag and N. Kushmerick and A. Lavelli},
  title    = {Evaluating machine learning for information extraction},
  journal  = {{ACM} International Conference Proceeding Series},
  volume   = {119},
  pages    = {345--352},
  year     = {2005},
  keywords = {Apprentissage machine, Extraction d'information}
}
-
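M. Koppel, J. Schler, and K. Zigdon, "Determining an author’s native language by mining a text for errors," in Proceedings of the Eleventh ACM SIGKDD International Conference on Knowledge Discovery in Data Mining, Chicago, Illinois, USA, 2005, pp. 624-628.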
@inproceedings{koppel_determiningauthors_2005,
  author    = {Koppel, Moshe and Schler, Jonathan and Zigdon, Kfir},
  title     = {Determining an author's native language by mining a text for errors},
  booktitle = {Proceedings of the Eleventh {ACM} {SIGKDD} International Conference on Knowledge Discovery in Data Mining},
  publisher = {{ACM}},
  address   = {Chicago, Illinois, {USA}},
  year      = {2005},
  pages     = {624--628},
  isbn      = {1-59593-135-X},
  doi       = {10.1145/1081870.1081947},
  keywords  = {Extraction d'information, Fouille de texte},
  abstract  = {In this paper, we show that stylistic text features can be exploited to determine an anonymous author's native language with high accuracy. Specifically, we first use automatic tools to ascertain frequencies of various stylistic idiosyncrasies in a text. These frequencies then serve as features for support vector machines that learn to classify texts according to author native language.},
  annote    = {koppelMoshe2005.pdf}
}
-
P. Lutsky, "Lexical semantics domain model for information extraction," , San Jose, CA, United States, 2004, pp. 86-88.
@inproceedings{lutsky_lexical_2004,
  author    = {Patricia Lutsky},
  title     = {Lexical semantics domain model for information extraction},
  series    = {{AAAI} Workshop - Technical Report},
  volume    = {{WS-04-01}},
  publisher = {American Association for Artificial Intelligence, Menlo Park, {CA} 94025-3496, United States},
  address   = {San Jose, {CA,} United States},
  year      = {2004},
  pages     = {86--88},
  keywords  = {Extraction d'information},
  abstract  = {The domain of operating system reference manuals uses linguistic constructs that are difficult to process since many terms are similar and most concepts are abstract software engineering constructs. The {SIFT} system for automatic test generation from these documents uses a natural-language-based formalism for software domain models. The formalism is based on the generative lexicon framework {(Pustejovsky} 1995). Examples show how this model is used for information extraction from texts in the software engineering domain.}
}
-
E. Morin, "Automatic acquisition and expansion of hypernym links," Computers and the Humanities, vol. 38, pp. 363-396, 2004.
@article{morin_automatic_2004,
  author   = {Emmanuel Morin},
  title    = {Automatic acquisition and expansion of hypernym links},
  journal  = {Computers and the Humanities},
  volume   = {38},
  pages    = {363--396},
  year     = {2004},
  url      = {http://www.ingentaconnect.com/content/klu/1574020x/2004/00000038/00000004/00001926},
  keywords = {Extraction d'information, Web},
  abstract = {Recent developments in computational terminology call for the design of multiple and complementary tools for the acquisition, the structuring and the exploitation of terminological data. This paper proposes to bridge the gap between term acquisition and thesaurus construction by offering a framework for automatic structuring of multi-word candidate terms with the help of corpus-based links between single-word terms. First, we present a system for corpus-based acquisition of terminological relationships through discursive patterns. This system is built on previous work on automatic extraction of hyponymy links through shallow parsing. Second, we show how hypernym links between single-word terms can be extended to semantic links between multi-word terms through corpus-based extraction of semantic variants. The induced hierarchy is incomplete but provides an automatic generalization of single-word terms relations to multi-word terms that are pervasive in technical thesauri and corpora.}
}
-
L. Zhou, "Automating linguistics-based cues for detecting deception in text-based asynchronous computer-mediated communications," Group Decision and Negotiation, vol. 13, iss. 1, pp. 81-106, 2004.
@article{zhou_automating_2004,
  author   = {L. Zhou},
  title    = {Automating linguistics-based cues for detecting deception in text-based asynchronous computer-mediated communications},
  journal  = {Group Decision and Negotiation},
  volume   = {13},
  number   = {1},
  pages    = {81--106},
  year     = {2004},
  url      = {http://www.ingentaconnect.com/content/klu/grup/2004/00000013/00000001/05256106},
  keywords = {Extraction d'information, Linguistique},
  abstract = {The detection of deception is a promising but challenging task. A systematic discussion of automated Linguistics Based Cues {(LBC)} to deception has rarely been touched before. The experiment studied the effectiveness of automated {LBC} in the context of text-based asynchronous computer mediated communication {(TA-CMC).} Twenty-seven cues either extracted from the prior research or created for this study were clustered into nine linguistics constructs: quantity, diversity, complexity, specificity, expressivity, informality, affect, uncertainty, and nonimmediacy. A test of the selected {LBC} in a simulated {TA-CMC} experiment showed that: (1) a systematic analysis of linguistic information could be useful in the detection of deception; (2) some existing {LBC} were effective as expected, while some others turned out in the opposite direction to the prediction of the prior research; and (3) some newly discovered linguistic constructs and their component {LBC} were helpful in differentiating deception from truth.}
}
-
J. Makkonen, H. Ahonen-Myka, and M. Salmenkivi, "Simple semantics in topic detection and tracking : special issue on ECIR," Information Retrieval, vol. 7, pp. 347-368, 2004.
@article{makkonen_simple_2004,
  author   = {J. Makkonen and H. {Ahonen-Myka} and M. Salmenkivi},
  title    = {Simple semantics in topic detection and tracking : special issue on {ECIR}},
  journal  = {Information Retrieval},
  volume   = {7},
  pages    = {347--368},
  year     = {2004},
  url      = {http://www.ingentaconnect.com/content/klu/inrt/2004/00000007/F0020003/05264860},
  keywords = {Extraction d'information, Ontologie},
  abstract = {Topic Detection and Tracking {(TDT)} is a research initiative that aims at techniques to organize news documents in terms of news events. We propose a method that incorporates simple semantics into {TDT} by splitting the term space into groups of terms that have the meaning of the same type. Such a group can be associated with an external ontology. This ontology is used to determine the similarity of two terms in the given group. We extract proper names, locations, temporal expressions and normal terms into distinct sub-vectors of the document representation. Measuring the similarity of two documents is conducted by comparing a pair of their corresponding sub-vectors at a time. We use a simple perceptron to optimize the relative emphasis of each semantic class in the tracking and detection decisions. The results suggest that the spatial and the temporal similarity measures need to be improved. Especially the vagueness of spatial and temporal terms needs to be addressed.}
}
-
R. M. Arevalo, "MICE : a module for named entities recognition and classification," International Journal of Corpus Linguistics, vol. 9, pp. 53-68, 2004.
@article{montserrat_arevalo_mice_2004,
  author   = {Rodriguez Montserrat Arevalo},
  title    = {{MICE} : a module for named entities recognition and classification},
  journal  = {International Journal of Corpus Linguistics},
  volume   = {9},
  pages    = {53--68},
  year     = {2004},
  url      = {http://www.ingentaconnect.com/content/jbp/ijcl/2004/00000009/00000001/art00003},
  keywords = {Extraction d'information, Recherche d'information},
  abstract = {In the field of corpus linguistics, Named Entity treatment includes the recognition and classification of different types of discursive elements like proper names, date, time, etc. These discursive elements play an important role in different Natural Language Processing applications and techniques such as Information Retrieval, Information Extraction, translations memories, document routers, etc.}
}
-
S. I. Carreno Cruz, "Analyse de la variation terminologique en corpus parallèle anglais-espagnol et de son incidence sur l’extraction de termes bilingues," Master's thesis, Université de Montréal, 2004.
@mastersthesis{carreno_cruz_analyse_2004,
  author   = {Carreno Cruz, Sahara Iveth},
  title    = {Analyse de la variation terminologique en corpus parallèle anglais-espagnol et de son incidence sur l’extraction de termes bilingues},
  type     = {Thèse (M. A.)},
  school   = {Université de Montréal},
  year     = {2004},
  pages    = {129 f.},
  keywords = {Extraction d'information},
  annote   = {carreno\_cruzSahara2005.zip}
}
-
P. Drouin, "Review of natural language processing for online applications : text retrieval, extraction and categorization by Peter Jackson and Isabelle Moulinier," Terminology, vol. 10, iss. 1, pp. 177-179, 2004.
@article{drouin_review_2004,
  author   = {Drouin, Patrick},
  title    = {Review of natural language processing for online applications : text retrieval, extraction and categorization by {Peter} {Jackson} and {Isabelle} {Moulinier}},
  journal  = {Terminology},
  volume   = {10},
  number   = {1},
  pages    = {177--179},
  year     = {2004},
  url      = {http://www.ingentaconnect.com/content/jbp/term/2004/00000010/00000001/art00010},
  keywords = {Catégorisation, Extraction d'information, Langage naturel, Recherche d'information, Web}
}
-
J.-W. Lee and D.-K. Baik, "A model for extracting keywords of document using term frequency and distribution," in Computational linguistics and intelligent text processing : 5th international conference, CICLing 2004 Seoul, Korea, february 15-21, 2004 : proceedings, Berlin; Heidelberg, 2004, pp. 437-440.
@inproceedings{lee_model_2004,
  author    = {Lee, Jae-Woo and Baik, Doo-Kwon},
  title     = {A model for extracting keywords of document using term frequency and distribution},
  booktitle = {Computational linguistics and intelligent text processing : 5th international conference, {CICLing} 2004 Seoul, Korea, February 15-21, 2004 : proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {2945},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2004},
  pages     = {437--440},
  isbn      = {9783540210061},
  keywords  = {Extraction d'information, Recherche d'information},
  abstract  = {In information retrieval systems, it is very important that indexing is defined very well by appropriate terms about documents. In this paper, we propose a simple retrieval model based on terms distribution characteristics besides term frequency in documents. We define the keywords distribution characteristics using a statistics, standard deviation. We can extract document keywords that term frequency is great and standard deviation is great. And if term frequency is great and standard deviation is small, the terms can be defined as paragraph keywords. Applying our proposed retrieval model we can search many documents or knowledge using the document keywords and paragraph keywords.},
  annote    = {leeJae-woo2004.pdf}
}
-
W.-T. Cai, S.-R. Wang, and Q.-S. Jiang, "Address extraction: a graph matching and ontology-based approach to conceptual information retrieval," in Proceedings of 2004 International Conference on Machine Learning and Cybernetics, vol. 3, Shanghai, China, 2004, pp. 1571-1576.
@inproceedings{wen-tao_address_2004,
  author    = {Cai, Wen-Tao and Wang, Sheng-Rui and Jiang, Qing-Shan},
  title     = {Address extraction: a graph matching and ontology-based approach to conceptual information retrieval},
  booktitle = {Proceedings of 2004 International Conference on Machine Learning and Cybernetics ({IEEE} Cat. No.04EX826)},
  volume    = {3},
  publisher = {{IEEE}},
  address   = {Shanghai, China},
  year      = {2004},
  pages     = {1571--1576},
  note      = {Copyright 2005, {IEE}},
  keywords  = {Extraction d'information, Intelligence artificielle, Ontologie},
  abstract  = {Address and related location-awareness information can be retrieved and extracted from the Web using content-based {IR} technologies. Much recent research on content-based information retrieval focuses on conceptual analysis of unstructured texts on the Web. This paper illustrates an address extraction application to achieve an ontology-based conceptual {IR} system with graph matching. Our key idea is that a document can be represented as a sub-graph of a predefined ontology graph. An approximate graph matching approach is used for content (address) extraction. This work is part of an ongoing project to develop an intelligent search agent to support driving-related information extraction from the Web},
  annote    = {8262480 address extraction ontology graph matching conceptual information retrieval Web content based information retrieval intelligent search agent}
}
-
H. Han, "Learning rules for conceptual structure on the Web : special issue on Web content mining," Journal of Intelligent Information Systems, vol. 22, iss. 3, pp. 237-256, 2004.
@article{han_learning_2004,
  author   = {H. Han},
  title    = {Learning rules for conceptual structure on the Web : special issue on Web content mining},
  journal  = {Journal of Intelligent Information Systems},
  volume   = {22},
  number   = {3},
  pages    = {237--256},
  year     = {2004},
  url      = {http://www.ingentaconnect.com/content/klu/jiis/2004/00000022/00000003/05271105},
  keywords = {Découverte de connaissances, Extraction d'information, Ontologie},
  abstract = {This paper presents an infrastructure and methodology to extract conceptual structure from Web pages, which are mainly constructed by {HTML} tags and incomplete text. Human beings can easily read Web pages and grasp an idea about the conceptual structure of underlying data, but cannot handle excessive amounts of data due to lack of patience and time. However, it is extremely difficult for machines to accurately determine the content of Web pages due to lack of understanding of context and semantics. Our work provides a methodology and infrastructure to process Web data and extract the underlying conceptual structure, in particular relationships between ontological concepts using Inductive Logic Programming in order to help with automating the processing of the excessive amount of Web data by capturing its conceptual structures.}
}
-
Sana-Leila, "Extraction de segments thématiques pour la construction de résumé multi-document orienté par un profil utilisateur," , Batz-sur-Mer, France, 2003.
-
P. Drouin, Extraction de termes : présentation de diverses approches, 2003.
-
H. Alani, S. Kim, D. E. Millard, M. J. Weal, W. Hall, P. H. Lewis, and N. R. Shadbolt, "Automatic ontology-based knowledge extraction and Tailored biography generation from Web documents," IEEE Intelligent Systems, vol. 18, iss. 1, pp. 14-21, 2003.
@article{alani_automatic_2003,
  author   = {Alani, Harith and Kim, Sanghee and Millard, David E. and Weal, Mark J. and Hall, Wendy and Lewis, Paul H. and Shadbolt, Nigel R.},
  title    = {Automatic ontology-based knowledge extraction and tailored biography generation from {Web} documents},
  journal  = {{IEEE} Intelligent Systems},
  volume   = {18},
  number   = {1},
  pages    = {14--21},
  year     = {2003},
  issn     = {1541-1672},
  url      = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.57.9194},
  keywords = {Extraction d'information, Ontologie},
  abstract = {This paper presents recent developments in the Artequakt project which seeks to automatically extract knowledge about artists from the Web, populate a knowledge base, and use it to generate personalized narrative biographies. An overview of the system architecture is presented and the three key components of that architecture are explained in detail, namely knowledge extraction, information management and biography construction. An example experiment is detailed and further challenges are outlined.},
  annote   = {alaniHarith2003.pdf}
}
-
M. R. Harris, G. K. Savova, T. M. Johnson, and C. G. Chute, "A term extraction tool for expanding content in the domain of functioning, disability, and health : proof of concept," Journal of Biomedical Informatics, vol. 36, pp. 250-259, 2003.
@article{harris_term_2003,
  author   = {M. R. Harris and G. K. Savova and T. M. Johnson and C. G. Chute},
  title    = {A term extraction tool for expanding content in the domain of functioning, disability, and health : proof of concept},
  journal  = {Journal of Biomedical Informatics},
  volume   = {36},
  pages    = {250--259},
  year     = {2003},
  url      = {http://www.ingentaconnect.com/content/els/15320464/2003/00000036/00000004/art00086},
  keywords = {Extraction d'information},
  abstract = {Among the challenges in developing terminology systems is providing complete content coverage of specialized subject fields. This paper reports on a term extraction tool designed for the development and expansion of terminology systems concerned with functioning, disability, and health. Content relevant to this domain is the emphasis of the foci and targets of many nursing terminologies. We extend previously published term extraction algorithms by applying two filters. The first filter is based on the raw frequency of the content words in the lexical string under consideration. The second filter applies the notion of a complete syntactic node to discover relevant noun or verb phrases. While we report on a limited corpus (30,607 words comprising 4103 terms from 60 dismissal note summaries), the recall, precision, and F-measures we observed are encouraging and suggest continued development and testing of the tool is merited.},
  annote   = {{{\textless}p{\textgreater}Terminologie} Nursing{\textless}/p{\textgreater}}
}
-
T. C. Rindflesch and M. Fiszman, "The interaction of domain knowledge and linguistic structure in natural language processing: interpreting hypernymic propositions in biomedical text," Journal of Biomedical Informatics, vol. 36, iss. 6, pp. 462-477, 2003.
@article{rindflesch_interaction_2003,
  author = {Thomas C. Rindflesch and Marcelo Fiszman},
  title = {The interaction of domain knowledge and linguistic structure in natural language processing: interpreting hypernymic propositions in biomedical text},
  journal = {Journal of Biomedical Informatics},
  volume = {36},
  number = {6},
  pages = {462--477},
  year = {2003},
  url = {http://www.sciencedirect.com/science?_ob=GatewayURL&_method=citationSearch&_uoikey=B6WHD-4BFP9GY-2&_origin=SDEMFRASCII&_version=1&md5=c0f3c4550d31fa1e5700b21f1f62183b},
  keywords = {Extraction d'information, Langage naturel},
  abstract = {Interpretation of semantic propositions in free-text documents such as {MEDLINE} citations would provide valuable support for biomedical applications, and several approaches to semantic interpretation are being pursued in the biomedical informatics community. In this paper, we describe a methodology for interpreting linguistic structures that encode hypernymic propositions, in which a more specific concept is in a taxonomic relationship with a more general concept. In order to effectively process these constructions, we exploit underspecified syntactic analysis and structured domain knowledge from the Unified Medical Language System {(UMLS).} After introducing the syntactic processing on which our system depends, we focus on the {UMLS} knowledge that supports interpretation of hypernymic propositions. We first use semantic groups from the Semantic Network to ensure that the two concepts involved are compatible; hierarchical information in the Metathesaurus then determines which concept is more general and which more specific. A preliminary evaluation of a sample based on the semantic group Chemicals and Drugs provides 83\% precision. An error analysis was conducted and potential solutions to the problems encountered are presented. The research discussed here serves as a paradigm for investigating the interaction between domain knowledge and linguistic structure in natural language processing, and could also make a contribution to research on automatic processing of discourse structure. Additional implications of the system we present include its integration in advanced semantic interpretation processors for biomedical text and its use for information extraction in specific domains. The approach has the potential to support a range of applications, including information retrieval and ontology engineering.}
},
-
M. Rossignol and P. Sébillot, "Extraction statistique sur corpus de classes de mots-clés thématiques," Traitement automatique des langues, vol. 44, iss. 33, pp. 217-246, 2003.
-
R. Feldman, Y. Regev, E. Hurvitz, and M. Finkelstein-Landau, "Mining the biomedical literature using semantic analysis and natural language processing techniques," BIOSILICO, vol. 1, iss. 2, pp. 69-80, 2003.
@article{feldman_miningbiomedical_2003,
  author = {Ronen Feldman and Yizhar Regev and Eyal Hurvitz and Michal {Finkelstein-Landau}},
  title = {Mining the biomedical literature using semantic analysis and natural language processing techniques},
  journal = {{BIOSILICO}},
  volume = {1},
  number = {2},
  pages = {69--80},
  year = {2003},
  url = {http://www.sciencedirect.com/science/article/B75GS-4BRJ67W-K/2/ca32b1c1a6689b513b51756be617b98d},
  keywords = {Extraction d'information, Indexation},
  abstract = {The information age has made the electronic storage of large amounts of data effortless. The proliferation of documents available on the Internet, corporate intranets, news wires and elsewhere is overwhelming. Search engines only exacerbate this overload problem by making increasingly more documents available in only a few keystrokes. This information overload also exists in the biomedical field, where scientific publications, and other forms of text-based data are produced at an unprecedented rate. Text mining is the combined, automated process of analyzing unstructured, natural language text to discover information and knowledge that are typically difficult to retrieve. Here, we focus on text mining as applied to the biomedical literature. We focus in particular on finding relationships among genes, proteins, drugs and diseases, to facilitate an understanding and prediction of complex biological processes. The {LitMiner(TM)} system, developed specifically for this purpose; is described in relation to the Knowledge Discovery and Data Mining Cup 2002, which serves as a formal evaluation of the system.}
},
-
J.-P. Bao, J.-Y. Shen, X.-D. Liu, and Q.-B. Song, "A new text feature extraction model and its application in document copy detection," in Proceedings of the 2003 International Conference on Machine Learning and Cybernetics, Xi’an, China, 2003, pp. 82-87.
@inproceedings{jun-peng_new_2003,
  author = {Bao, Jun-Peng and Shen, Jun-Yi and Liu, Xiao-Dong and Song, Qin-Bao},
  title = {A new text feature extraction model and its application in document copy detection},
  booktitle = {Proceedings of the 2003 International Conference on Machine Learning and Cybernetics},
  volume = {1},
  pages = {82--87},
  address = {Xi'an, China},
  publisher = {{IEEE}},
  year = {2003},
  doi = {10.1109/ICMLC.2003.1264447},
  url = {http://dx.doi.org/10.1109/ICMLC.2003.1264447},
  note = {{IEEE} Cat. No.03EX693. Copyright 2004, {IEE}},
  keywords = {Extraction d'information, Recherche d'information},
  abstract = {Text feature extraction is a common issue in information retrieval, text mining, Web mining, text classification/clustering and document copy etc. The most popular approach is word frequency based scheme, which uses a word frequency vector to represent a document. Cosine function, dot product and proportion function are regular similarity measures of vector. But that is only global semantic feature of a document and loses local feature and structural information so that it prevents us to distinguish text well, especially in copy detection. In this paper we present a new text feature extraction model: semantic sequence model {(SSM)} that based on the concepts of word distance, word density and semantic sequence. The semantic sequences of a document contain not only local semantic features but also global feature and structural information, on which we get excellent accuracy of text copy detection. At the end of the paper, we contrast {SSM} with {VSM} and {RFM} and the experimental results show {SSM} is a superior model},
  annote = {7953930 text feature extraction model document copy detection word frequency based scheme word frequency vector cosine function dot product proportion function global semantic feature semantic sequence model word distance word density local semantic features text copy detection information retrieval plagiarism probability}
},
-
K. N. Sgarbas, G. E. Londos, N. D. Fakotakis, and G. K. Kokkinakis, "The WATCHER project : building an agent for automatic extraction of language resources from the Internet," Literary and linguistic computing, vol. 18, iss. 4, pp. 449-464, 2003.
@article{sgarbas_watcher_2003,
  author = {Kyriakos N. Sgarbas and George E. Londos and Nikos D. Fakotakis and George K. Kokkinakis},
  title = {The {WATCHER} project : building an agent for automatic extraction of language resources from the Internet},
  shorttitle = {The {WATCHER} project},
  journal = {Literary and linguistic computing},
  volume = {18},
  number = {4},
  pages = {449--464},
  month = nov,
  year = {2003},
  doi = {10.1093/llc/18.4.449},
  keywords = {Extraction d'information, Web},
  abstract = {The {WATCHER} project aims to automate the extraction of language resources from the Internet via an intelligent agent called the {WATCHER'.} This agent (in its final form) will be able to actively search and collect subject-specific and language-specific texts and build corpora and lexicons from them. Although the resources will still have to be checked for validity after their collection, the proposed method requires the minimum of human interaction. Apart from its ability to collect these resources automatically, the {WATCHER} will also be able to track the evolution of a target language over time by collecting resources annually and presenting their analysis in annual reports. The {WATCHER} is still under development. This paper presents an overview of its architecture and functionality, and reports recent progress.},
  annote = {sgarbasKyruakos2003.pdf}
},
-
C. Gieger, H. Deneke, and J. Fluck, "The future of text mining in genome-based clinical research," Biosilico, vol. 1, iss. 3, pp. 97-102, 2003.
@article{gieger_future_2003,
  author = {Christian Gieger and Hartwig Deneke and Juliane Fluck},
  title = {The future of text mining in genome-based clinical research},
  journal = {{BIOSILICO}},
  volume = {1},
  number = {3},
  pages = {97--102},
  year = {2003},
  url = {http://www.sciencedirect.com/science/article/B75GS-4BNT1YB-6/2/7e5d406bacd2769e5b2a33be958adb5f},
  keywords = {Extraction d'information, Fouille de texte, Ontologie, Recherche d'information},
  abstract = {Efficient information retrieval and extraction is a major challenge in molecular biology and genome-based clinical research. In addition, there is an increasing demand to combine information from different resources and across different disciplines in life sciences. Unfortunately, a large proportion of this information is only available in scientific articles. Moreover, the volume of literature is growing almost exponentially. Text mining provides methods to retrieve and extract information contained in free-text automatically. Here, we discuss the challenges and limitations of text mining in biology and medicine, including unsolved problems and necessary developments.}
},
-
P. Drouin, "Term extraction using non-technical corpora as a point of leverage," Terminology, vol. 9, iss. 1, pp. 99-115, 2003.
@article{drouin_term_2003,
  author = {P. Drouin},
  title = {Term extraction using non-technical corpora as a point of leverage},
  journal = {Terminology},
  volume = {9},
  number = {1},
  pages = {99--115},
  year = {2003},
  keywords = {Extraction d'information},
  abstract = {This paper describes a new hybrid term extraction technique for technical corpora. Our main goal is to reduce the amount of noise in the list of candidate terms by restricting the lexical items that can appear inside candidate terms. In order to do so, we base our term extraction process on lexical items selected by a statistical test that targets items that are highly specific to the technical corpus being analyzed.}
},
-
T. Jo, "Neural based approach to keyword extraction from documents," in Computational Science and Its Applications — ICCSA 2003, Berlin; Heidelberg, 2003, pp. 456-461.
@inproceedings{jo_neural_2003,
  author = {Taeho Jo},
  title = {Neural based approach to keyword extraction from documents},
  booktitle = {Computational Science and Its Applications — {ICCSA} 2003},
  series = {Lecture notes in computer science},
  volume = {2667},
  pages = {456--461},
  address = {Berlin; Heidelberg},
  publisher = {Springer},
  year = {2003},
  isbn = {3540401555, 9783540401551},
  keywords = {Extraction d'information, Réseau de neurones},
  abstract = {Documents are unstructured data consisting of natural language. Document surrogate means the structured data converted from original documents to process them in computer systems. Document surrogate is usually represented into a list of words. Because not all words in a document reflect its content, it is necessary to select important words related with its content among them. Such important words are called keywords and they are selected with a particular equation based on {TF} {(Term} Frequency) and {IDF} (inverted Document Frequency). Actually, not only {TF} and {IDF} but also the position of each word in the document and the inclusion of the word in the title should be considered to select keywords among words contained in the text. The equation based on these factors gets too complicate to be applied to the selection of keywords. This paper proposes the neural network model, back propagation, in which these factors are used as the features and feature vectors are generated, and with which keywords are selected. This paper will show that backpropagation outperforms the equation in distinguishing keywords.},
  annote = {joTaeho2003.pdf}
},
-
J.-H. Chang, J. W. Lee, Y. Kim, and B.-T. Zhang, "Topic extraction from text documents using multiple-cause networks," in PRICAI 2002 : trends in artificial intelligence : 7th Pacific Rim international conference on artificial intelligence Tokyo, Japan, august 18–22, 2002 : proceedings, Berlin; Heidelberg, 2002, pp. 399-406.
@inproceedings{chang_topic_2002,
  author = {Chang, Jeong-Ho and Lee, Jae Won and Kim, Yuseop and Zhang, Byoung-Tak},
  title = {Topic extraction from text documents using multiple-cause networks},
  booktitle = {{PRICAI} 2002 : trends in artificial intelligence : 7th Pacific Rim international conference on artificial intelligence Tokyo, Japan, august 18–22, 2002 : proceedings},
  series = {Lecture notes in computer science},
  volume = {2417},
  pages = {399--406},
  address = {Berlin; Heidelberg},
  publisher = {Springer},
  year = {2002},
  url = {http://dx.doi.org/10.1007/3-540-45683-X_47},
  keywords = {Classification, Extraction d'information},
  abstract = {This paper presents an approach to the topic extraction from text documents using probabilistic graphical models. Multiple-cause networks with latent variables are used and the Helmholtz machines are utilized to ease the learning and inference. The learning in this model is conducted in a purely data-driven way and does not require prespecified categories of the given documents. Topic words extraction experiments on the {TDT-2} collection are presented. Especially, document clustering results on a subset of {TREC-8} ad-hoc task data show the substantial reduction of the inference time without significant deterioration of performance.},
  annote = {changJeong-ho2002.pdf}
},
-
G. Nenadic, H. Mima, I. Spasic, S. Ananiadou, and J. Tsujii, "Terminology-driven literature mining and knowledge acquisition in biomedicine," International Journal of Medical Informatics, vol. 67, iss. 1-3, pp. 33-48, 2002.
@article{nenadic_terminology-driven_2002,
  author = {Goran Nenadic and Hideki Mima and Irena Spasic and Sophia Ananiadou and Jun-ichi Tsujii},
  title = {Terminology-driven literature mining and knowledge acquisition in biomedicine},
  journal = {International Journal of Medical Informatics},
  volume = {67},
  number = {1--3},
  pages = {33--48},
  year = {2002},
  url = {http://www.sciencedirect.com/science/article/B6T7S-47736Y2-6/2/aed749747a1f91ba1e8fb1bc5fdd003f},
  keywords = {Extraction d'information},
  abstract = {In this paper we describe Tagged Information Management System {(TIMS),} an integrated knowledge management system for the domain of molecular biology and biomedicine, in which terminology-driven literature mining, knowledge acquisition {(KA),} knowledge integration {(KI),} and {XML}-based knowledge retrieval are combined using tag information management and ontology inference. The system integrates automatic terminology acquisition, term variation management, hierarchical term clustering, tag-based information extraction {(IE),} and ontology-based query expansion. {TIMS} supports introducing and combining different types of tags (linguistic and domain-specific, manual and automatic). Tag-based interval operations and a query language are introduced in order to facilitate {KA} and retrieval from documents. Through {KA} examples, we illustrate the way in which literature mining techniques can be utilised for knowledge discovery from documents.}
},
-
A. Maedche, G. Neumann, S. Staab, and J. Kacprzyk, "Bootstrapping an ontology based information extraction system." Springer, 2002.
@incollection{maedche_bootstrappingontology_2002,
  author = {A. Maedche and G. Neumann and S. Staab and J. Kacprzyk},
  title = {Bootstrapping an ontology based information extraction system},
  booktitle = {Intelligent exploration of the Web},
  series = {Studies in fuzziness and soft computing},
  volume = {111},
  publisher = {Springer},
  year = {2002},
  url = {http://citeseer.ist.psu.edu/maedche02bootstrapping.html},
  keywords = {Apprentissage machine, Extraction d'information, Ontologie},
  abstract = {Automatic intelligent web exploration will benefit from shallow information extraction techniques if the latter can be brought to work within many different domains. The major bottleneck for this, however, lies in the so far difficult and expensive modeling of lexical knowledge, extraction rules, and an ontology that together define the information extraction system. In this paper we present a bootstrapping approach that allows for the fast creation of an ontology-based information extracting system relying on several basic components, viz. a core information extraction system, an ontology engineering environment and an inference engine. We make extensive use of machine learning techniques to support the semi-automatic, incremental bootstrapping of the domain-specific target information extraction system.}
},
-
Y.-S. Lai and C.-H. Wu, "Meaningful term extraction and discriminative term selection in text categorization via unknown-word methodology," ACM transactions on asian language information processing, vol. 1, iss. 1, pp. 34-64, 2002.
@article{lai_meaningful_2002,
  author = {Lai, Yu-Sheng and Wu, Chung-Hsien},
  title = {Meaningful term extraction and discriminative term selection in text categorization via unknown-word methodology},
  journal = {{ACM} transactions on asian language information processing},
  volume = {1},
  number = {1},
  pages = {34--64},
  year = {2002},
  issn = {1530-0226},
  doi = {10.1145/595576.595579},
  url = {http://portal.acm.org/ft_gateway.cfm?id=595579&type=pdf&coll=GUIDE&dl=GUIDE&CFID=4944394&CFTOKEN=15881842},
  keywords = {Catégorisation, Extraction d'information},
  abstract = {In this article, an approach based on unknown words is proposed for meaningful term extraction and discriminative term selection in text categorization. For meaningful term extraction, a phrase-like unit {(PLU)-based} likelihood ratio is proposed to estimate the likelihood that a word sequence is an unknown word. On the other hand, a discriminative measure is proposed for term selection and is combined with the {PLU-based} likelihood ratio to determine the text category. We conducted several experiments on a news corpus, called {MSDN.} The {MSDN} corpus is collected from an online news Website maintained by the {Min-Sheng} Daily News, Taiwan. The corpus contains 44,675 articles with over 35 million words. The experimental results show that the system using a simple classifier achieved 95.31\% accuracy. When using a state-of-the-art classifier, {kNN,} the average accuracy is 96.40\%, outperforming all the other systems evaluated on the same collection, including the traditional term-word by {kNN} (88.52\%); sleeping-experts (82.22\%); sparse phrase by four-word sleeping-experts (86.34\%); and Boolean combinations of words by {RIPPER} (87.54\%). A proposed purification process can effectively reduce the dimensionality of the feature space from 50,576 terms in the word-based approach to 19,865 terms in the unknown word-based approach. In addition, more than 80\% of automatically extracted terms are meaningful. Experiments also show that the proportion of meaningful terms extracted from training data is relative to the classification accuracy in outside testing.},
  annote = {laiYu-sheng2002.pdf}
},
-
H. Fulford, "Exploring terms and their linguistic environment in text : a domain-independent approach to automated term extraction," Terminology, vol. 7, iss. 2, pp. 259-279, 2002.
@article{fulford_exploring_2002,
  author = {H. Fulford},
  title = {Exploring terms and their linguistic environment in text : a domain-independent approach to automated term extraction},
  journal = {Terminology},
  volume = {7},
  number = {2},
  pages = {259--279},
  year = {2002},
  url = {http://www.ingentaconnect.com/content/jbp/term/2002/00000007/00000002/art00007},
  keywords = {Extraction d'information},
  abstract = {The proliferation of specialist texts over recent decades has exacerbated the need for term extraction software to assist terminologists in compiling terminology collections. To this end, an automated approach to English term extraction is presented, which, in keeping with the multidisciplinary working environments of many contemporary terminologists, is designed to be domain independent. Based on observations made of the linguistic features of terms and their linguistic environment in text, this approach identifies single- and multi-word terms spanning a range of word classes. An implementation of the approach (denoted {‘Textprobe’)} is described and evaluated by measuring its term extraction efficiency against the manual scanning output of both domain experts and terminologists. Results obtained in the evaluation suggest that a high proportion of single-and multi-word terms can successfully be extracted from special language texts. It is anticipated that the approach will be portable to other European languages.}
},
-
I. Mani, Automatic summarization, Amsterdam: John Benjamins, 2001.
@book{mani_automatic_2001,
  author = {Inderjeet Mani},
  title = {Automatic summarization},
  publisher = {John Benjamins},
  address = {Amsterdam},
  year = {2001},
  keywords = {Extraction d'information},
  abstract = {With the explosion in the quantity of on-line text and multimedia information in recent years, there has been a renewed interest in automatic summarization. This book provides a systematic introduction to the field, explaining basic definitions, the strategies used by human summarizers, and automatic methods that leverage linguistic and statistical knowledge to produce extracts and abstracts. Drawing from a wealth of research in artificial intelligence, natural language processing, and information retrieval, the book also includes detailed assessments of evaluation methods and new topics such as multi-document and multimedia summarization. Previous automatic summarization books have been either collections of specialized papers, or else authored books with only a chapter or two devoted to the field as a whole. This is the first textbook on the subject, developed based on teaching materials used in two one-semester courses. To further help the student reader, the book includes detailed case studies, accompanied by end-of-chapter reviews and an extensive glossary.}
},
-
M.-R. Amini, "Apprentissage automatique et recherche de l’information : application à l’extraction d’information de surface et au résumé du texte," PhD Thesis, Université Paris 6, 2001.
@phdthesis{amini_apprentissage_2001,
  author = {Amini, Massih-Réza},
  title = {Apprentissage automatique et recherche de l'information : application à l'extraction d'information de surface et au résumé du texte},
  type = {Thèse de doctorat},
  school = {Université Paris 6},
  year = {2001},
  pages = {209 p.},
  keywords = {Extraction d'information, Recherche d'information},
  annote = {aminiMassih-Reza2001.pdf}
},
-
T. C. Jo and J. Seo, "Text categorization based on multiple keyword extraction," in The proceedings of applied informatics, 2001, 2001, pp. 95-100.
@inproceedings{jo_text_2001,
  author = {Taeho C. Jo and Jerry Seo},
  title = {Text categorization based on multiple keyword extraction},
  booktitle = {The proceedings of applied informatics, 2001},
  pages = {95--100},
  year = {2001},
  keywords = {Catégorisation, Extraction d'information},
  abstract = {Text categorization is the process of assigning a category or categories to the given document, among the predefined categories. The systems, such as {KMS} {(Knowledge} Management System), {DMS} {(Document} Management System), {DLS} {(Digital} Library System), and {IRS} {(Information} Retrieval System), require the functions of categorizing documents automatically. These systems store and deal with textual data. These data should be stored with being classified responding to its own category. This paper proposes that terms are selected and extracted with multiple techniques, instead of single technique. Multiple techniques of extracting terms are expected to generate less misclassified terms than single technique. The algorithm of selecting and extracting terms from a document is called term extractor. The group of several different algorithms is called the committee of term extractor. Therefore terms are extracted from the document with the committee of term extractor, not with a single term extractor. The committee of term extractors is composed by several coupling schemes. The group of terms generated from a particular term extractor is called term set. Coupling schemes are intersection, union, and voting.},
  annote = {joTaeho2001.pdf}
},
-
T. Poibeau, "An open architecture for multi-domain information extraction," in Innovative Applications of Artificial Intelligence - Conference Proceedings, Seattle, WA, 2001, pp. 81-86.
@inproceedings{poibeau_open_2001,
  author = {Thierry Poibeau},
  title = {An open architecture for multi-domain information extraction},
  booktitle = {Innovative Applications of Artificial Intelligence - Conference Proceedings},
  pages = {81--86},
  address = {Seattle, {WA}},
  publisher = {American Association for Artificial Intelligence},
  year = {2001},
  keywords = {Extraction d'information},
  abstract = {This paper presents a multi-domain information extraction system. The overall architecture of the system is detailed. A set of machine learning tools helps the expert to explore the corpus and automatically derive knowledge from this corpus, Thus, the system allows the end-user to rapidly develop a local ontology giving an accurate image of the content of the text, so that the expert can elaborate new extraction templates. The system is finally evaluated using classical indicators.},
  annote = {Compilation and indexing terms, Copyright 2007 Elsevier Inc. All rights reserved 02256983572 Information extraction {(IE)}}
},
-
J. Vivaldi and H. Rodriguez, "Improving term extraction by combining different techniques," Terminology, vol. 7, iss. 1, pp. 31-48, 2001.
@article{vivaldi_improving_2001,
  author = {J. Vivaldi and H. Rodriguez},
  title = {Improving term extraction by combining different techniques},
  journal = {Terminology},
  volume = {7},
  number = {1},
  pages = {31--48},
  year = {2001},
  url = {http://www.ingentaconnect.com/content/jbp/term/2001/00000007/00000001/art00003},
  keywords = {Extraction d'information},
  abstract = {Two different reasons suggest that combining the performance of several term extractors could lead to an improvement in overall system accuracy. On the one hand, there is no clear agreement on whether to follow statistical, linguistic or hybrid approaches for (semi-) automatic term extraction. On the other hand, combining different knowledge sources (e.g. classifiers) has proved successful in improving the performance of individual sources on several {NLP} tasks (some of them closely related to or involved in term extraction), such as context-sensitive spelling correction, part-of-speech tagging, word sense disambiguation, parsing, text classification and filtering, etc. In this paper, we present a proposal for combining a number of different term extraction techniques in order to improve the accuracy of the resulting system. The approach has been applied to the domain of medicine for the Spanish language. A number of tests have been carried out with encouraging results.}
},
-
M. Craven, D. DiPasquo, D. Freitag, A. McCallum, T. Mitchell, K. Nigam, and S. Slattery, "Learning to construct knowledge bases from the World Wide Web," Artificial Intelligence, vol. 118, iss. 1-2, pp. 69-113, 2000.
@article{craven_learning_2000,
  author = {Mark Craven and Dan {DiPasquo} and Dayne Freitag and Andrew {McCallum} and Tom Mitchell and Kamal Nigam and Sean Slattery},
  title = {Learning to construct knowledge bases from the World Wide Web},
  journal = {Artificial Intelligence},
  volume = {118},
  number = {1--2},
  pages = {69--113},
  year = {2000},
  url = {http://www.sciencedirect.com/science/article/B6TYF-43FX0XK-3/2/8610b9d209e3e80a5ea6dfb53abdd711},
  keywords = {Apprentissage machine, Classification, Extraction d'information, Ontologie, Web},
  abstract = {The World Wide Web is a vast source of information accessible to computers, but understandable only to humans. The goal of the research described here is to automatically create a computer understandable knowledge base whose content mirrors that of the World Wide Web. Such a knowledge base would enable much more effective retrieval of Web information, and promote new uses of the Web to support knowledge-based inference and problem solving. Our approach is to develop a trainable information extraction system that takes two inputs. The first is an ontology that defines the classes (e.g., company, person, employee, product) and relations (e.g., employed\_by, produced\_by) of interest when creating the knowledge base. The second is a set of training data consisting of labeled regions of hypertext that represent instances of these classes and relations. Given these inputs, the system learns to extract information from other pages and hyperlinks on the Web. This article describes our general approach, several machine learning algorithms for this task, and promising initial results with a prototype system that has created a knowledge base describing university people, courses, and research projects.}
},
-
N. Turenne, "Apprentissage statistique pour l’extraction de concepts à partir de textes : application au filtrage d’informations textuelles," PhD Thesis, 2000.
@phdthesis{turenne_apprentissage_2000,
  author   = {Nicolas Turenne},
  title    = {Apprentissage statistique pour l’extraction de concepts à partir de textes : application au filtrage d’informations textuelles},
  school   = {Université Louis Pasteur, Strasbourg, France},
  type     = {Thèse de doctorat, spécialité informatique},
  year     = {2000},
  keywords = {Extraction d'information, Intelligence artificielle, Recherche d'information},
  abstract = {Cette thèse présente un modèle de construction automatique et approximatif de la représentation du sens d'un texte. Une structuration du domaine, couvert par des documents, est obtenue par une classification (en anglais « clustering ») faisant apparaître des thèmes sémantiques. Il faut améliorer les techniques en leur permettant de traiter les documents non indexés, en améliorant les résultats par une adaptation de connaissances linguistiques et une analyse des relations que marquent les cooccurrences entre termes. La quantité grandissante d'informations électroniques permet de constituer des échantillons de données variés et significatifs. Les techniques pour décrire les relations entre termes sont issues de méthodes mathématiques usuellement appliquées aux données structurées non textuelles. Le couplage de connaissances propres aux données avec une méthodologie adaptée aux données textuelles devrait apporter une amélioration des résultats. Nous tentons de justifier: d'une part l'utilisation de mécanismes linguistiques réduisant les biais d'une statistique descriptive des occurrences d'un terme, d'autre part l'utilisation d'une méthode basée sur les graphes dont les motifs permettraient de récupérer les relations conceptuelles entre termes. Dans un troisième temps nous facilitons l'interprétation de résultats émanant de traitements automatiques par la qualification consensuelle du thème représenté par une classe. L'interprétation de classes reste difficile, due aux multiples points de vue qu'un lecteur peut se faire des associations entre termes. Des classes de meilleure qualité facilitent l'interprétation, assistée par un thésaurus, que l'on peut attribuer à la structuration conceptuelle des termes d'un domaine. Le développement {d'Internet} renforce l'échange de documents électroniques entre les acteurs de différents sites.
Le développement de systèmes logiciels d'échanges de documents appelés «workflow » dans les intranets d'entreprise augmente la fluidité des documents entre individus et entre services. Un système qui permet d'apprendre automatiquement des profils d'utilisateur et d'exploiter ces connaissances pour distribuer l'information semble incontournable. Nous essayons de caractériser un centre d'intérêt par des classes de termes.},
  annote   = {{{\textless}p{\textgreater}turenneNicolas2000.pdf{\textless}/p{\textgreater}}},
},
-
C. Jacquemin and D. Bourigault, Term extraction and automatic indexing, 2000.
@misc{jacquemin_term_2000,
  author   = {Christian Jacquemin and Didier Bourigault},
  title    = {Term extraction and automatic indexing},
  year     = {2000},
  url      = {http://citeseer.ist.psu.edu/454369.html},
  abstract = {This chapter presents a new domain of research and development in Natural Language Processing {(NLP)} that is concerned with the representation, acquisition, and recognition of terms. Terms are pervasive in scientific and technical documents; their identification is a crucial issue for any application dealing with the analysis, understanding, generation, or translation of such documents. In particular, the ever-growing mass of specialized documentation available on-line, in industrial and...},
  keywords = {Extraction d'information, Indexation},
  annote   = {{{\textless}p{\textgreater}bourigaultDidier2003.pdf{\textless}/p{\textgreater}}},
},
-
A. Bolioli, L. Dini, V. Di Tomaso, A. Goy, and D. Sestero, "MILK : a hybrid system for multilingual indexing and information extraction," in Recent advances in natural language processing II : selected papers from RANLP ’97, N. Nicolov and R. Mitkov, Eds., Amsterdam: John Benjamins, 2000, pp. 399-410.
@inproceedings{bolioli_milk_2000,
  author    = {Bolioli, Andrea and Dini, Luca and Di Tomaso, Vittorio and Goy, Anna and Sestero, D.},
  editor    = {Nicolov, Nicolas and Mitkov, Ruslan},
  title     = {{MILK} : a hybrid system for multilingual indexing and information extraction},
  booktitle = {Recent advances in natural language processing {II} : selected papers from {RANLP} '97},
  series    = {Current issues in linguistic theory},
  volume    = {189},
  publisher = {John Benjamins},
  address   = {Amsterdam},
  year      = {2000},
  pages     = {399--410},
  isbn      = {1-55619-966-X},
  keywords  = {Extraction d'information, Linguistique},
},
-
H. Chang, Y. Ko, and J. Hsu, "An event-driven and ontology-based approach for the delivery and information extraction of e-mails," in Proceedings International Symposium on Multimedia Software Engineering, Taipei, Taiwan, 2000, pp. 103-109.
@inproceedings{heng-hsou_event-driven_2000,
  author    = {Chang, Heng-Hsou and Ko, Yau-Hwang and Hsu, Jang-Pong},
  title     = {An event-driven and ontology-based approach for the delivery and information extraction of e-mails},
  booktitle = {Proceedings International Symposium on Multimedia Software Engineering},
  address   = {Taipei, Taiwan},
  publisher = {{IEEE} Comput. Soc},
  year      = {2000},
  pages     = {103--109},
  doi       = {10.1109/MMSE.2000.897199},
  url       = {http://dx.doi.org/10.1109/MMSE.2000.897199},
  abstract  = {In the field of information extraction {(IE),} the extraction of information from documents is usually event-oriented. Therefore, many information extraction machines have built their domain knowledge based on events. However, information extraction is often limited in its application in specific domains, because the events are simply detected by predefined keywords. We propose event detection driven intelligent information extraction by using the neural network paradigm. In this paper, the backpropagation {(BP)} learning algorithm is adopted to train the event detector. In order to detect the potential events in documents effectively, we apply natural language processing technology to aid the selection of nouns as feature words. Unrelated nouns are filtered by the analysis based on document frequency distribution. Finally, selected nouns are conceptualized into concepts. These concepts are supposed to characterize documents appropriately and they are stored in ontology as a knowledge base. In the experimental results, we achieved high accuracy both in the inside testing and outside testing of Internet documents. By means of the well-trained event detector, the information extraction task can be certainly applied in wider domains. Eventually, this event detection technology is introduced for the delivery and information extraction of e-mail},
  note      = {Copyright 2001, {IEE}},
  keywords  = {Extraction d'information, Ontologie},
  annote    = {{\textless}p{\textgreater}6806550 event-driven approach ontology-based approach intelligent information extraction e-mail neural network backpropagation learning natural language processing document frequency distribution experiment Internet{\textless}/p{\textgreater}},
},
-
S. Wermter, "Knowledge extraction from transducer neural networks," Applied Intelligence, vol. 12, iss. 1-2, pp. 27-42, 2000.
@article{wermter_knowledge_2000,
  author   = {Wermter, Stefan},
  title    = {Knowledge extraction from transducer neural networks},
  journal  = {Applied Intelligence},
  volume   = {12},
  number   = {1-2},
  pages    = {27--42},
  year     = {2000},
  url      = {http://www.ingentaconnect.com/content/klu/apin/2000/00000012/00000001/00243690},
  abstract = {Previously neural networks have shown interesting performance results for tasks such as classification, but they still suffer from an insufficient focus on the structure of the knowledge represented therein. In this paper, we analyze various knowledge extraction techniques in detail and we develop new transducer extraction techniques for the interpretation of recurrent neural network learning. First, we provide an overview of different possibilities to express structured knowledge using neural networks. Then, we analyze a type of recurrent network rigorously, applying a broad range of different techniques. We argue that analysis techniques, such as weight analysis using Hinton diagrams, hierarchical cluster analysis, and principal component analysis may be useful for providing certain views on the underlying knowledge. However, we demonstrate that these techniques are too static and too low-level for interpreting recurrent network classifications. The contribution of this paper is a particularly broad analysis of knowledge extraction techniques. Furthermore, we propose dynamic learning analysis and transducer extraction as two new dynamic interpretation techniques. Dynamic learning analysis provides a better understanding of how the network learns, while transducer extraction provides a better understanding of what the network represents.},
  keywords = {Extraction d'information, Réseau de neurones},
},
-
M. Bernard, Introduction aux études littéraires assistées par ordinateur, Paris: Presses universitaires de France, 1999.
@book{bernard_introduction_1999,
  author    = {Michel Bernard},
  title     = {Introduction aux études littéraires assistées par ordinateur},
  series    = {Écritures électroniques},
  publisher = {Presses universitaires de France},
  address   = {Paris},
  year      = {1999},
  keywords  = {Extraction d'information, Fouille de texte, Informatique},
},
-
M. Lenz and A. Glintschert, "On texts, cases, and concepts," in XPS-99 : knowledge-based systems, survey and future directions : Würzburg, Allemagne, 3-5 March 1999, 1999, pp. 148-155.
@inproceedings{lens_texts_1999,
  author    = {Lenz, Mario and Glintschert, Alexander},
  title     = {On texts, cases, and concepts},
  booktitle = {{XPS-99} : knowledge-based systems, survey and future directions : Würzburg, Allemagne, 3-5 March 1999},
  series    = {Lecture notes in computer science},
  volume    = {1570},
  publisher = {Springer},
  year      = {1999},
  pages     = {148--155},
  isbn      = {3-540-65658-8},
  abstract  = {The management of textual information is getting more and more attention within the case-based reasoning community. In this paper, we will address the question of how a case base can be obtained from a given textual description and how this representation scheme can be enriched by higher level concepts.},
  keywords  = {Extraction d'information},
  annote    = {{{\textless}p{\textgreater}lenzMario1999.pdf{\textless}/p{\textgreater}}},
},
-
K. Gabi, Extraction dynamique de connaissances à partir de textes par réseaux neuronaux, Institut national polytechnique de Grenoble, 1997.
-
D. Bouchaffra and J. G. Meunier, A thematic knowledge extraction in text using a markovian random field approach, 1995.
@misc{bouchaffra_thematic_1995,
  author   = {Bouchaffra, Djamel and Meunier, Jean-Guy},
  title    = {A thematic knowledge extraction in text using a markovian random field approach},
  year     = {1995},
  abstract = {We present a Markovian Random Field modeling for thematic knowledge extraction in text. An analogy is made between a flow of thematic investigations/textual fragments matching and statistical mechanics systems. The Markovian Field Knowledge Extraction machine {(MAFKE)} that we propose is based on a dynamical interaction between thematic queries and fragments composing a text. The representation of the textual knowledge system is submitted to state variations emerging from the flow of thematic queries. The {MAFKE} machine tries to satisfy the user thematic queries by changing the set of Units of Information {(UNIFs)} contained in a fragment. This change is computed with respect to the input thematic queries. Hence, {MAFKE} machine transists from one configuration state to another by changing the threshold assigned to the pertinency of {UNIFs.} For each state, a certain degradation of the system which depends on the thematic query index and this threshold is considered. The equivalence concept between an {MRF} and the Gibbs distribution {(Max} entropy) enables us to consider the energy and potential functions of this physical system. We use simulated annealing algorithm to isolate low energy states: this corresponds to the best (in some sense) knowledge extraction from the text that satisfies the user investigation. During the evolution towards these lower energy states, a fragment classifier emerges: the Markovian Random Field machine behaves as a classifier.},
  keywords = {Extraction d'information},
  annote   = {{{\textless}p{\textgreater}bouchaffraDjamel1995.pdf{\textless}/p{\textgreater}}},
},
-
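D. Bouchaffra and J.-G. Meunier, "A thematic knowledge extraction modeling through a markovian random field approach," in 6th international DEXA 95 conference and workshop on database and expert systems applications, 1995.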
@inproceedings{bouchaffra_thematic_1995-1,
  author    = {Bouchaffra, Djamel and Meunier, Jean-Guy},
  title     = {A thematic knowledge extraction modeling through a markovian random field approach},
  booktitle = {6th international {DEXA} 95 conference and workshop on database and expert systems applications},
  year      = {1995},
  keywords  = {Extraction d'information},
},
-
E. Riloff and W. Lehnert, "Information extraction as a basis for high-precision text classification," ACM transactions on information systems, vol. 12, iss. 3, pp. 296-333, 1994.
@article{riloff_information_1994,
  author   = {Ellen Riloff and Wendy Lehnert},
  title    = {Information extraction as a basis for high-precision text classification},
  journal  = {{ACM} transactions on information systems},
  volume   = {12},
  number   = {3},
  pages    = {296--333},
  year     = {1994},
  issn     = {1046-8188},
  doi      = {10.1145/183422.183428},
  abstract = {We describe an approach to text classification that represents a compromise between traditional word-based techniques and in-depth natural language processing. Our approach uses a natural language processing task called “information extraction” as a basis for high-precision text classification. We present three algorithms that use varying amounts of extracted information to classify texts. The relevancy signatures algorithm uses linguistic phrases; the augmented relevancy signatures algorithm uses phrases and local context; and the case-based text classification algorithm uses larger pieces of context. Relevant phrases and contexts are acquired automatically using a training corpus. We evaluate the algorithms on the basis of two test sets from the {MUC-4} corpus. All three algorithms achieved high precision on both test sets, with the augmented relevancy signatures algorithm and the case-based algorithm reaching 100\% precision with over 60\% recall on one set. Additionally, we compare the algorithms on a larger collection of 1700 texts and describe an automated method for empirically deriving appropriate threshold values. The results suggest that information extraction techniques can support high-precision text classification and, in general, that using more extracted information improves performance. As a practical matter, we also explain how the text classification system can be easily ported across domains.},
  keywords = {Extraction d'information},
  annote   = {{{\textless}p{\textgreater}riloffEllen1994.pdf{\textless}/p{\textgreater}}},
},
-
G. Salton, J. Allan, and C. Buckley, "Automatic structuring and retrieval of large text files," Communications of the ACM, vol. 37, iss. 2, pp. 97-108, 1994.
@article{salton_automatic_1994,
  author   = {Gerald Salton and James Allan and Chris Buckley},
  title    = {Automatic structuring and retrieval of large text files},
  journal  = {Communications of the {ACM}},
  volume   = {37},
  number   = {2},
  pages    = {97--108},
  year     = {1994},
  keywords = {Extraction d'information, Fouille de texte, Recherche d'information},
},
-
G. G. Yen and Z. Wu, "Ranked centroid projection : a data visualization approach with self-organizing maps," IEEE Transactions on Neural Networks, vol. 19, iss. 2, pp. 245-259, 2008.
@article{yen_ranked_2008,
  author     = {Gary G. Yen and Zheng Wu},
  title      = {Ranked centroid projection : a data visualization approach with self-organizing maps},
  shorttitle = {Ranked Centroid Projection},
  journal    = {{IEEE} Transactions on Neural Networks},
  volume     = {19},
  number     = {2},
  pages      = {245--259},
  month      = feb,
  year       = {2008},
  issn       = {1045-9227},
  doi        = {10.1109/TNN.2007.905858},
  abstract   = {The self-organizing map {(SOM)} is an efficient tool for visualizing high-dimensional data. In this paper, the clustering and visualization capabilities of the {SOM,} especially in the analysis of textual data, i.e., document collections, are reviewed and further developed. A novel clustering and visualization approach based on the {SOM} is proposed for the task of text mining. The proposed approach first transforms the document space into a multidimensional vector space by means of document encoding. Afterwards, a growing hierarchical {SOM} {(GHSOM)} is trained and used as a baseline structure to automatically produce maps with various levels of detail. Following the {GHSOM} training, the new projection method, namely the ranked centroid projection {(RCP),} is applied to project the input vectors to a hierarchy of {2-D} output maps. The {RCP} is used as a data analysis tool as well as a direct interface to the data. In a set of simulations, the proposed approach is applied to an illustrative data set and two real-world scientific document collections to demonstrate its applicability.},
  keywords   = {Fouille de donnée, Fouille de texte, Réseau de neurones, Visualisation de l'information},
  annote     = {{{\textless}p{\textgreater}Accession} Number: 31171851; Yen, Gary G. 1; Email Address: gyen@okstate.edu; Zheng Wu 1; Affiliations: 1: School of Electrical and Computer Engineering, Oklahoma State University, Stillwater, {OK} 74078 {USA;} Issue Info: Feb2008, Vol. 19 Issue 2, p245; Thesaurus Term: {NEURAL} networks {(Computer} science); Thesaurus Term: {VISUAL} programming languages {(Computer} science); Thesaurus Term: {DATA} mining; Subject Term: {SELF-organizing} maps; Subject Term: {TEXT} mining {(Information} retrieval); Subject Term: {CONTENT} mining; Subject Term: {SELF-organizing} systems; Subject Term: {VECTOR} analysis; Subject Term: {ENCODING;} {Author-Supplied} Keyword: Data visualization; {Author-Supplied} Keyword: document clustering; {Author-Supplied} Keyword: self-organizing map {(SOM);} Number of Pages: 15p; Illustrations: 3 charts, 6 diagrams, 15 graphs, 2 bw; Document Type: Article{\textless}/p{\textgreater}},
},
-
A. Gonsalves, "Clarabridge improves text-analytics platform," Intelligent Enterprise, vol. 11, iss. 6, p. 6, 2008.
@article{gonsalves_clarabridge_2008,
  author   = {Antone Gonsalves},
  title    = {Clarabridge improves text-analytics platform},
  journal  = {Intelligent Enterprise},
  volume   = {11},
  number   = {6},
  pages    = {6},
  year     = {2008},
  issn     = {1524-3621},
  abstract = {The article reports on the upgrade of Clarabridge's text-analytics platform Content Mining Platform. It states that Release 3.0 of the Content Mining Platform includes Navigator, a drag-and-drop interface and its extraction engine has finer domain-specific tuning. It also provides the information of the enhancements done as well as the technology used in the upgrade of Release 3.0. Furthermore, it stresses that this version of the platform also offers collaboration tools for business analysts in different departments. An overview of the Gaylord Hotels, which is one of the customers of Clarabridge, is also presented.},
  keywords = {Catalogage, Fouille de donnée, Fouille de texte, Recherche d'information},
  annote   = {{{\textless}p{\textgreater}Accession} Number: 33125974; Gonsalves, Antone; Issue Info: Jun2008, Vol. 11 Issue 6, p6; Thesaurus Term: {DATA} mining; Thesaurus Term: {DATABASE} searching; Thesaurus Term: {ONLINE} data processing; Thesaurus Term: {INFORMATION} retrieval; Thesaurus Term: {COMPUTER} programs; Subject Term: {SOFTWARE;} Subject Term: {CATALOGING} -- Analytical entry; Subject Term: {CONTENT} mining; Subject Term: {TEXT} mining {(Information} retrieval); Subject Term: {AUTOMATIC} extracting {(Information} science) ; {Company/Entity:} {CLARABRIDGE} {(Company);} Number of Pages: 1p; Document Type: Article{\textless}/p{\textgreater}},
},
-
I-H. Ting, "Web-mining applications in e-commerce and e-services," Online Information Review, vol. 32, iss. 2, pp. 129-132, 2008.
@article{i-hsien_ting_web-mining_2008,
  author   = {Ting, I-Hsien},
  title    = {Web-mining applications in e-commerce and e-services},
  journal  = {Online Information Review},
  volume   = {32},
  number   = {2},
  pages    = {129--132},
  year     = {2008},
  issn     = {1468-4527},
  doi      = {10.1108/14684520810879773},
  url      = {http://www.emeraldinsight.com/10.1108/14684520810879773},
  abstract = {Purpose – The purpose of this guest editorial is to introduce the papers in this special issue. Design/methodology/approach – A brief introduction about the issue of web-mining applications in e-commerce and e-services is provided, along with a summary of the main contributions of the papers that are included in the special issue. Findings – The value of web-mining techniques can be enhanced through applying them to real environments such as e-commerce and e-services. The research fields of web mining, e-commerce and e-services can also be expanded. Originality/value – An overview of the special issue and related research is provided in this paper.},
  keywords = {Fouille de donnée, Web},
},
-
R. Gu, M. Zhu, L. Zhao, and N. Zhang, "Interest mining in virtual learning environments," Online Information Review, vol. 32, iss. 2, pp. 133-146, 2008.
@article{rong_gu_interest_2008,
  author   = {Rong Gu and Miaoliang Zhu and Liying Zhao and Ningning Zhang},
  title    = {Interest mining in virtual learning environments},
  journal  = {Online Information Review},
  volume   = {32},
  number   = {2},
  pages    = {133--146},
  year     = {2008},
  issn     = {1468-4527},
  doi      = {10.1108/14684520810879782},
  url      = {http://www.emeraldinsight.com/10.1108/14684520810879782},
  abstract = {Purpose – Behaviour in virtual learning environments {(VLE),} including travel, gaze, manipulate, gesture and conversation, offer considerable information about the user's implicit interest. The purpose of this study is to find an approach for user interest mining via behaviour analysis in a {VLE.} Design/methodology/approach – According to research in psychology, any interaction in a {VLE} has implications for the user's implicit interest. In order to mine a user's implicit interest, an explicit interaction-interest model needs to be established. This paper presents findings from the concept classification of behaviour in a {VLE.} Based on this classification, the paper proposes a hierarchical interaction model. In this model the relation between interaction and user interest can be described and used to improve system performance. Findings – In the experimental prototype the authors found that user-implicit interest could be mined via stages of web mining, i.e. capture the user's original gesture signal, data pre-process, pattern discovery, interaction goal and interest mining. The mined user's interest information can be used to update the state of local interest, leading to a reduction in network traffic and promotion of better system performance. Originality/value – This is an original study using behaviour analysis for interest mining in e-learning. Research on interest mining in e-learning focused on content mining or search engine and usage mining in web courses. The paper provides valuable clues regarding user interest mining in a {VLE,} in which the context is different from usual web courses. The research output can be implemented widely, including online learning, and especially in the {VLE.}},
  keywords = {Document numérique, Fouille de donnée},
},
-
M. Abulaish and L. Dey, "Biological relation extraction and query answering from MEDLINE abstracts using ontology-based text mining," Data and Knowledge Engineering, vol. 61, iss. 2, pp. 228-262, 2007.
@article{abulaish_biological_2007,
  author   = {Muhammad Abulaish and Lipika Dey},
  title    = {Biological relation extraction and query answering from {MEDLINE} abstracts using ontology-based text mining},
  journal  = {Data and Knowledge Engineering},
  volume   = {61},
  number   = {2},
  pages    = {228--262},
  year     = {2007},
  abstract = {The rapid growth of the biological text data repository makes it difficult for human beings to access required information in a convenient and effective manner. The problem arises due to the fact that most of the information is embedded within unstructured or semi-structured text that computers cannot interpret very easily. In this paper we have presented an ontology-based Biological Information Extraction and Query Answering {(BIEQA)} System, which initiates text mining with a set of concepts stored in a biological ontology, and thereafter mines possible biological relations among those concepts using {NLP} techniques and co-occurrence-based analysis. The system extracts all frequently occurring biological relations among a pair of biological concepts through text mining. A mined relation is associated to a fuzzy membership value, which is proportional to its frequency of occurrence in the corpus and is termed a fuzzy biological relation. The fuzzy biological relations extracted from a text corpus along with other relevant information components like biological entities occurring within a relation, are stored in a database. The database is integrated with a query-processing module. The query-processing module has an interface, which guides users to formulate biological queries at different levels of specificity. © 2006 Elsevier {B.V.} All rights reserved.},
  keywords = {Fouille de donnée, Ontologie},
  annote   = {{{\textless}p{\textgreater}Compilation} and indexing terms, Copyright 2007 Elsevier Inc. All rights reserved 071210496835 {0169-023X} Biological Information Extraction and Query Answering {(BIEQA)} Systems Text mining Biological relation extraction Biological query processing{\textless}/p{\textgreater}},
},
-
B. R. Lewis and S. M. Maas, "QDA Miner 2.0 : mixed-model qualitative data analysis software," Field Methods, vol. 19, iss. 1, pp. 87-108, 2007.
@article{lewis_qda_2007,
  author     = {R. Barry Lewis and Steven M. Maas},
  title      = {{QDA} Miner 2.0 : mixed-model qualitative data analysis software},
  shorttitle = {{QDA} Miner 2.0},
  journal    = {Field Methods},
  volume     = {19},
  number     = {1},
  pages      = {87--108},
  month      = feb,
  year       = {2007},
  doi        = {10.1177/1525822X06296589},
  url        = {http://fmx.sagepub.com},
  keywords   = {Fouille de donnée},
  annote     = {{{\textless}p{\textgreater}lewisBarry2007.pdf{\textless}/p{\textgreater}}},
},
-
R. Sanderson and P. Watry, "Integrating data and text mining processes for digital library applications," in Proceedings of the 7th ACM/IEEE-CS Joint Conference on Digital Libraries, Vancouver, BC, Canada, 2007, pp. 73-79.
@inproceedings{sanderson_integrating_2007,
  author    = {Robert Sanderson and Paul Watry},
  title     = {Integrating data and text mining processes for digital library applications},
  booktitle = {Proceedings of the 7th {ACM/IEEE-CS} Joint Conference on Digital Libraries},
  address   = {Vancouver, {BC,} Canada},
  publisher = {{ACM}},
  year      = {2007},
  pages     = {73--79},
  isbn      = {978-1-59593-644-8},
  doi       = {10.1145/1255175.1255188},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1255188&type=pdf},
  abstract  = {This paper explores the integration of text mining and data mining techniques, digital library systems, and computational and data grid technologies with the objective of developing an online classification service exemplar. We discuss the current research issues relating to the use of data mining algorithms and toolkits for textual data; the necessary changes within the Cheshire3 Information Framework to accommodate analysis workflows; the outcomes of a demonstrator based on the National Library of Medicine's Medline dataset; and the provision of comparable metrics for evaluation purposes. The prototype has resulted in extremely accurate online classification services and offers a novel method of supporting text mining and data mining within a highly scaled computational environment, integrated seamlessly into the digital library architecture.},
  keywords  = {Bibliothèque numérique, Fouille de donnée, Fouille de texte},
  annote    = {{{\textless}p{\textgreater}sandersonRobert2007.pdf{\textless}/p{\textgreater}}},
},
-
C. C. Trumbach and D. Payne, "Identifying synonymous concepts in preparation for technology mining," Journal of Information Science, vol. 33, iss. 6, pp. 660-677, 2007.
@article{trumbach_identifying_2007,
  author   = {Cherie Courseault Trumbach and Dinah Payne},
  title    = {Identifying synonymous concepts in preparation for technology mining},
  journal  = {Journal of Information Science},
  volume   = {33},
  number   = {6},
  pages    = {660--677},
  month    = dec,
  year     = {2007},
  issn     = {0165-5515},
  doi      = {10.1177/0165551506076401},
  abstract = {In this research, the development of a 'concept-clumping algorithm' designed to improve the clustering of technical concepts is demonstrated. The algorithm developed first identifies a list of technically relevant noun phrases from a cleaned extracted list and then applies a rule-based algorithm for identifying synonymous terms based on shared words in each term. An assessment of the algorithm found that the algorithm has an 89-91\% precision rate, was successful in moving technically important terms higher in the term frequency list, and improved the technical specificity of term clusters.},
  keywords = {Fouille de donnée, Fouille de texte},
  annote   = {{{\textless}p{\textgreater}Accession} Number: 28141770; Trumbach, Cherie Courseault 1; Email Address: ctrumbac@uno.edu; Payne, Dinah 1; Affiliations: 1: Department of Management, University of New Orleans, New Orleans, {USA;} Issue Info: 2007, Vol. 33 Issue 6, p660; Thesaurus Term: {DATA} mining; Thesaurus Term: {DATABASE} searching; Thesaurus Term: {INFORMATION} resources; Thesaurus Term: {INFORMATION} science; Thesaurus Term: {DATABASES;} Subject Term: {TEXT} mining {(Information} retrieval); Subject Term: {BIBLIOMETRICS;} Subject Term: {STATISTICAL} methods; Subject Term: {COMPUTER} files; {Author-Supplied} Keyword: data quality; {Author-Supplied} Keyword: knowledge discovery; {Author-Supplied} Keyword: term similarity; {Author-Supplied} Keyword: text cleaning; {Author-Supplied} Keyword: text mining; Number of Pages: 18p; Illustrations: 10 charts, 1 diagram; Document Type: Article{\textless}/p{\textgreater}},
},
-
B. Liu, Web data mining : exploring hyperlinks, contents, and usage data, Berlin: Springer, 2007.
@book{liu_web_2007,
  author    = {Bing Liu},
  title     = {Web data mining : exploring hyperlinks, contents, and usage data},
  series    = {Data-centric systems and applications},
  publisher = {Springer},
  address   = {Berlin},
  year      = {2007},
  isbn      = {9783540378815},
  keywords  = {Fouille de donnée, Web},
},
-
R. Feldman and J. Sanger, The text mining handbook : advanced approaches in analyzing unstructured data, Cambridge ; New York: Cambridge University Press, 2007.
@book{feldman_text_2007,
  author    = {Feldman, Ronen and Sanger, James},
  title     = {The text mining handbook : advanced approaches in analyzing unstructured data},
  publisher = {Cambridge University Press},
  address   = {Cambridge ; New York},
  year      = {2007},
  isbn      = {9780521836579},
  url       = {http://library.books24x7.com/toc.asp?bookid=23164},
  keywords  = {Fouille de texte},
  annote    = {Accessible en ligne via Books24x7 (http://library.books24x7.com/toc.asp?bookid=23164). TOC : Introduction to text mining -- Core text mining operations -- Text mining preprocessing techniques -- Categorization -- Clustering -- Information extraction -- Probabilistic models for information extraction -- Preprocessing applications using probabilistic and hybrid approaches -- Presentation-layer considerations for browsing and query refinement -- Visualization approaches -- Link analysis -- Text mining applications.}
}
-
W. Graco, T. Semenova, and E. Dubossarsky, "Toward knowledge-driven data mining," San Jose, California, 2007, pp. 49-54.
@inproceedings{graco_toward_2007,
  author    = {Graco, Warwick and Semenova, Tatiana and Dubossarsky, Eugene},
  title     = {Toward knowledge-driven data mining},
  booktitle = {Proceedings of the 2007 International Workshop on Domain Driven Data Mining},
  publisher = {{ACM}},
  address   = {San Jose, California},
  year      = {2007},
  pages     = {49--54},
  isbn      = {978-1-59593-846-6},
  doi       = {10.1145/1288552.1288559},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1288559&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=34831909},
  abstract  = {This paper highlights the need to move from a method-driven approach to a knowledge-driven approach to data mining. A number of issues are covered including the need to develop 'smart' data-mining algorithms which include expert mining and modelling knowledge, the need to use 'intelligent data' or data that contains both metadata and metaknowledge, the need to marry business knowledge with technical knowledge with data mining and the need to use intelligence and other qualitative analyses to determine where data-mining efforts should be focused.},
  keywords  = {Fouille de donnée},
  annote    = {gracoWarwick2007.pdf}
}
-
J. Han and M. Kamber, Data mining : concepts and techniques, 2006.
@book{han_data_2006,
  author    = {Han, Jiawei and Kamber, Micheline},
  title     = {Data mining : concepts and techniques},
  edition   = {Second},
  publisher = {Morgan Kaufmann},
  year      = {2006},
  isbn      = {1558609016},
  url       = {http://books.google.ca/books?id=AfL0t-YzOrEC&hl=fr},
  abstract  = {Our ability to generate and collect data has been increasing rapidly. Not only are all of our business, scientific, and government transactions now computerized, but the widespread use of digital cameras, publication tools, and bar codes also generate data. On the collection side, scanned text and image platforms, satellite remote sensing systems, and the World Wide Web have flooded us with a tremendous amount of data. This explosive growth has generated an even more urgent need for new techniques and automated tools that can help us transform this data into useful information and knowledge. Like the first edition, which was voted the most popular data mining book by {KD} Nuggets readers, this book explores the concepts and techniques for the discovery of patterns hidden in large data sets, focusing on issues relating to their feasibility, usefulness, effectiveness, and scalability. However, since the publication of the first edition, great progress has been made in the development of new data mining methods, systems, and applications. This new edition substantially enhances the first edition, and new chapters have been added to address recent developments on mining complex types of data including stream data, sequence data, graph structured data, social network data, and multi-relational data.},
  keywords  = {Fouille de donnée}
}
-
C. Baker and R. Witte, "Mutation Mining—A Prospector’s Tale," Information Systems Frontiers, vol. 8, iss. 1, pp. 47-57, 2006.
@article{baker_mutation_2006,
  author   = {Baker, Christopher and Witte, Rene},
  title    = {Mutation Mining---A Prospector's Tale},
  journal  = {Information Systems Frontiers},
  volume   = {8},
  number   = {1},
  pages    = {47--57},
  year     = {2006},
  abstract = {Protein structure visualization tools render images that allow the user to explore structural features of a protein. Context specific information relating to a particular protein or protein family is, however, not easily integrated and must be uploaded from databases or provided through manual curation of input files. Protein Engineers spend considerable time iteratively reviewing both literature and protein structure visualizations manually annotated with mutated residues. Meanwhile, text mining tools are increasingly used to extract specific units of raw text from scientific literature and have demonstrated the potential to support the activities of Protein Engineers. The transfer of mutation specific raw-text annotations to protein structures requires integrated data processing pipelines that can co-ordinate information retrieval, information extraction, protein sequence retrieval, sequence alignment and mutant residue mapping. We describe the Mutation Miner pipeline designed for this purpose and present case study evaluations of the key steps in the process. Starting with literature about mutations made to protein families; haloalkane dehalogenase, bi-phenyl dioxygenase, and xylanase we enumerate relevant documents available for text mining analysis, the available electronic formats, and the number of mutations made to a given protein family. We review the efficiency of {NLP} driven protein sequence retrieval from databases and report on the effectiveness of Mutation Miner in mapping annotations to protein structure visualizations. We highlight the feasibility and practicability of the approach.},
  keywords = {Fouille de donnée, Fouille de texte}
}
-
A. Culotta, A. McCallum, and J. Betz, "Integrating probabilistic extraction models and data mining to discover relations and patterns in text," New York, New York, 2006, pp. 296-303.
@inproceedings{culotta_integrating_2006,
  author    = {Culotta, Aron and McCallum, Andrew and Betz, Jonathan},
  title     = {Integrating probabilistic extraction models and data mining to discover relations and patterns in text},
  booktitle = {Proceedings of the Human Language Technology Conference of the {NAACL}, Main Conference},
  publisher = {Association for Computational Linguistics},
  address   = {New York, New York},
  year      = {2006},
  pages     = {296--303},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1220873&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  abstract  = {In order for relation extraction systems to obtain human-level performance, they must be able to incorporate relational patterns inherent in the data (for example, that one's sister is likely one's mother's daughter, or that children are likely to attend the same college as their parents). Hand-coding such knowledge can be time-consuming and inadequate. Additionally, there may exist many interesting, unknown relational patterns that both improve extraction performance and provide insight into text. We describe a probabilistic extraction model that provides mutual benefits to both "top-down" relational pattern discovery and "bottom-up" relation extraction.},
  keywords  = {Approche probabiliste, Fouille de donnée, Fouille de texte},
  annote    = {culottaAron2006.pdf}
}
-
M. Grobelnik, J. Brank, D. Mladenic, B. Novak, and B. Fortuna, "Using DMoz for constructing ontology from data stream," in ITI 2006. Proceedings of the 28th International Conference on Information Technology Interfaces, Cavtat/Dubrovnik, Croatia, 2006, 6 pp.
@inproceedings{grobelnik_using_2006,
  author    = {Grobelnik, M. and Brank, J. and Mladenic, D. and Novak, B. and Fortuna, B.},
  title     = {Using {DMoz} for constructing ontology from data stream},
  booktitle = {{ITI} 2006. Proceedings of the 28th International Conference on Information Technology Interfaces},
  publisher = {{IEEE}},
  address   = {Cavtat/Dubrovnik, Croatia},
  year      = {2006},
  pagetotal = {6},
  abstract  = {This paper presents an approach for constructing an ontology from a stream of documents. Named entities extracted from the documents are used as instances of the ontology. Entities and co-occurring entity pairs are represented by feature vectors based on the content of the documents where they occurred. In general, concepts and relations can be formed into an ontological structure either by clustering or by classification into an existing topic hierarchy. We propose the latter using {DMoz} as an existing topic hierarchy. The approach is efficient and can scale to large data sets. We propose a framework that incorporates the stream mining process into a formal definition of the ontology. We describe a software component implementing this approach, and present experiments using a large collection of news},
  note      = {Copyright 2006, The Institution of Engineering and Technology},
  keywords  = {Fouille de donnée, Ontologie, Recherche d'information, Web},
  annote    = {9189429 {DMoz} data stream mining process named entity extraction feature vectors ontological structure data clustering data classification topic hierarchy software component machine learning Web directory}
}
-
W. M. Pottenger, S. Li, and C. D. Janneck, "Distributed higher-order text mining : theory and practice," San Diego, California, 2006, pp. 446-447.
@inproceedings{pottenger_distributed_2006,
  author     = {Pottenger, William M. and Li, Shenzhi and Janneck, Christopher D.},
  title      = {Distributed higher-order text mining : theory and practice},
  shorttitle = {Distributed higher-order text mining},
  booktitle  = {Proceedings of the 2006 International Conference on Digital Government Research},
  publisher  = {{ACM}},
  address    = {San Diego, California},
  year       = {2006},
  pages      = {446--447},
  doi        = {10.1145/1146598.1146742},
  url        = {http://portal.acm.org/ft_gateway.cfm?id=1146742&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  abstract   = {This highlight discusses the current and ongoing research into distributed higher-order text mining, as implemented using the {DiHO} {ARM} algorithm in the {D-HOTM} system. The {DiHO} {ARM} algorithm performs association rule mining in the absence of full knowledge of a global schema on distributed data that is neither vertically nor horizontally fragmented. The {D-HOTM} system encapsulates the {DiHO} (and potentially any other) rule-mining algorithm in a distributed system, designed as an extensible digital toolset for data analysts in law enforcement, counterterrorism, health care and other application domains.},
  keywords   = {Analyse de texte, Fouille de donnée, Fouille de texte},
  annote     = {pottengerWilliam2006.pdf}
}
-
Y. Li and N. Zhong, "Rough association rule mining in text documents for acquiring Web user information needs." 2006, pp. 226-232.
@inproceedings{li_rough_2006,
  author    = {Li, Yuefeng and Zhong, Ning},
  title     = {Rough association rule mining in text documents for acquiring Web user information needs},
  booktitle = {Proceedings of the 2006 {IEEE/WIC/ACM} International Conference on Web Intelligence},
  publisher = {{IEEE} Computer Society},
  year      = {2006},
  pages     = {226--232},
  isbn      = {0-7695-2747-7},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1249099&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  abstract  = {It is a big challenge to apply data mining techniques for effective Web information gathering because of duplications and ambiguities of data values (e.g., terms). To provide an effective solution to this challenge, this paper first explains the relationship between association rules and rough set based decision rules. It proves that a decision pattern is a kind of closed pattern. It also presents a novel concept of rough association rules in order to improve the effectiveness of association rule mining. The premise of a rough association rule consists of a set of terms and a frequency distribution of terms. The distinct advantage of rough association rules is that they contain more specific information than normal association rules. It is also feasible to update rough association rules dynamically to produce effective results.},
  keywords  = {Fouille de donnée, Web},
  annote    = {liYuefeng2006.pdf}
}
-
A. Christy and P. Thambidurai, "Efficient information extraction using machine learning and classification using genetic and C4.8 algorithms," Information Technology Journal, vol. 5, iss. 6, pp. 1023-7, 2006.
@article{christy_efficient_2006,
  author   = {Christy, A. and Thambidurai, P.},
  title    = {Efficient information extraction using machine learning and classification using genetic and {C4.8} algorithms},
  journal  = {Information Technology Journal},
  volume   = {5},
  number   = {6},
  pages    = {1023--1027},
  year     = {2006},
  abstract = {With the amount of information available on the Internet growing at phenomenal rate, research in improving the effectiveness and efficiency of information extraction and knowledge discovery has become crucial. Text mining is one of the most important ways of extracting meaningful information from a large collection of text documents, leaving aside the information which is not useful to the ordinary user. In this study, we propose a method for automatically extracting key elements from a collection of text documents by extracting a set of features using a machine learning technique. We have used the genetic algorithms for classifying the features those are selected by the machine learning technique. We also compared the results produced by the genetic algorithm with 10 folds cross-validation at C4.8, Rain Forest, Raintree and {NB} tree methods and we have found C4.8 has produced better precision and recall and also the genetic algorithm is an effective classifier and is quite competitive even though the concept increases in complexity},
  note     = {Copyright 2006, The Institution of Engineering and Technology},
  keywords = {Apprentissage machine, Fouille de donnée},
  annote   = {9193959 information extraction machine learning genetic algorithm C4.8 algorithm Internet knowledge discovery text mining Rain Forest Raintree {NB} tree parsing feature set extraction}
}
-
X. Jiang and A.-H. Tan, "Mining ontological knowledge from domain-specific text documents," in Proceedings. Fifth IEEE International Conference on Data Mining, Houston, TX, USA, 2006, 4 pp.
@inproceedings{xing_mining_2006,
  author    = {Jiang, Xing and Tan, Ah-Hwee},
  title     = {Mining ontological knowledge from domain-specific text documents},
  booktitle = {Proceedings. Fifth {IEEE} International Conference on Data Mining},
  publisher = {{IEEE} Computer Society},
  address   = {Houston, {TX}, {USA}},
  year      = {2006},
  pagetotal = {4},
  abstract  = {Traditional text mining systems employ shallow parsing techniques and focus on concept extraction and taxonomic relation extraction. This paper presents a novel system called {CRCTOL} for mining rich semantic knowledge in the form of ontology from domain-specific text documents. By using a full text parsing technique and incorporating both statistical and lexico-syntactic methods, the knowledge extracted by our system is more concise and contains a richer semantics compared with alternative systems. We conduct a case study wherein {CRCTOL} extracts ontological knowledge, specifically key concepts and semantic relations, from a terrorism domain text collection. Quantitative evaluation, by comparing with a state-of-the-art ontology learning system known as text-to-onto, has shown that {CRCTOL} produces much better precision and recall for both concept and relation extraction, especially from sentences with complex structures},
  note      = {Copyright 2006, The Institution of Engineering and Technology},
  keywords  = {Analyse de texte, Approche statistique, Fouille de donnée, Ontologie},
  annote    = {8857416 ontological knowledge mining domain-specific text document text mining full text parsing statistical method lexico-syntactic method concept extraction relation extraction concept relation concept tuple ontology learning}
}
-
C.-C. Chang and R.-S. Chen, "Using data mining technology to solve classification problems : a case study of campus digital library," The Electronic Library, vol. 24, iss. 3, pp. 307-321, 2006.
@article{chan-chine_chang_using_2006,
  author   = {Chang, Chan-Chine and Chen, Ruey-Shun},
  title    = {Using data mining technology to solve classification problems : a case study of campus digital library},
  journal  = {The Electronic Library},
  volume   = {24},
  number   = {3},
  pages    = {307--321},
  year     = {2006},
  issn     = {0264-0473},
  doi      = {10.1108/02640470610671178},
  url      = {http://www.emeraldinsight.com/10.1108/02640470610671178},
  abstract = {Purpose – Traditional library catalogs have become inefficient and inconvenient in assisting library users. Readers may spend a lot of time searching library materials via printed catalogs. Readers need an intelligent and innovative solution to overcome this problem. The paper seeks to examine data mining technology which is a good approach to fulfill readers' requirements. Design/methodology/approach – Data mining is considered to be the non-trivial extraction of implicit, previously unknown, and potentially useful information from data. This paper analyzes readers' borrowing records using the techniques of data analysis, building a data warehouse, and data mining. Findings – The paper finds that after mining data, readers can be classified into different groups according to the publications in which they are interested. Some people on the campus also have a greater preference for multimedia data. Originality/value – The data mining results shows that all readers can be categorized into five clusters, and each cluster has its own characteristics. The frequency with which graduates and associate researchers borrow multimedia data is much higher. This phenomenon shows that these readers have a higher preference for accepting digitized publications. Also, the number of readers borrowing multimedia data has increased over the years. This trend indicates that readers preferences are gradually shifting towards reading digital publications.},
  keywords = {Bibliothèque numérique, Classification, Fouille de donnée}
}
-
H. Azzag, C. Guinot, and G. Venturini, "Data and text mining with hierarchical clustering ants." Berlin; Heidelberg: Springer, 2006, pp. 153-189.
@incollection{azzag_data_2006,
  author    = {Azzag, Hanene and Guinot, Christiane and Venturini, Gilles},
  title     = {Data and text mining with hierarchical clustering ants},
  booktitle = {Swarm intelligence in data mining},
  series    = {Studies in computational intelligence},
  volume    = {34},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2006},
  pages     = {153--189},
  doi       = {10.1007/978-3-540-34956-3_7},
  url       = {http://dx.doi.org/10.1007/978-3-540-34956-3_7},
  keywords  = {Fouille de donnée, Fouille de texte}
}
-
A. I. Adegorite, O. A. Basir, M. S. Kamel, and K. B. Shaban, "An approach to mining picture objects based on textual cues," in Machine learning and data mining in pattern recognition : 4th international conference, MLDM 2005, Leipzig, Germany, july 9-11, 2005 : proceedings, Berlin; Heidelberg, 2005, pp. 466-475.
@inproceedings{adegorite_approach_2005,
  author    = {Adegorite, A. I. and Basir, O. A. and Kamel, M. S. and Shaban, K. B.},
  title     = {An approach to mining picture objects based on textual cues},
  booktitle = {Machine learning and data mining in pattern recognition : 4th international conference, {MLDM} 2005, Leipzig, Germany, {July} 9--11, 2005 : proceedings},
  series    = {Lecture notes in computer science},
  volume    = {3587},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2005},
  pages     = {466--475},
  abstract  = {The task of extracting knowledge from text is an important research problem for information processing and document understanding. Approaches to capture the semantics of picture objects in documents constitute subjects of great interest in the domain of document mining recently. In this paper, we present an approach to extracting information about picture objects in a document using cues from the text written about them. The goal of this work is to mine a document and understand the content of picture objects in the document based on meaning inferred from the texts written about such objects. We apply some natural language processing techniques to extract semantic information about picture objects in a document and process texts written about them. The mining algorithms were developed and implemented as a working system and gone through testing and experimentations. Results and future extensions of the work are discussed in this paper.},
  keywords  = {Analyse de texte, Fouille de donnée}
}
-
S. M. Weiss, N. Indurkhya, T. Zhang, and F. Damerau, Text mining : predictive methods for analyzing unstructured information, New York: Springer-Verlag, 2005.
@book{weiss_text_2005,
  author    = {Weiss, Sholom M. and Indurkhya, Nitin and Zhang, T. and Damerau, F.},
  title     = {Text mining : predictive methods for analyzing unstructured information},
  publisher = {{Springer-Verlag}},
  address   = {New York},
  year      = {2005},
  isbn      = {0387954333},
  abstract  = {The growth of the web can be seen as an expanding public digital library collection. Online digital information extends far beyond the web and its publicly available information. Huge amounts of information are private and are of interest to local communities, such as the records of customers of a business. This information is overwhelmingly text and has its record-keeping purpose, but an automated analysis might be desirable to find patterns in the stored records. Analogous to this data mining is text mining, which also finds patterns and trends in information samples but which does so with far less structured--though with greater immediate utility for users--ingredients. This book focuses on the concepts and methods needed to expand horizons beyond structured, numeric data to automated mining of text samples. It introduces the new world of text mining and examines proven methods for various critical text-mining tasks, such as automated document indexing and information retrieval and search. New research areas are explored, such as information extraction and document summarization, that rely on evolving text-mining techniques.},
  keywords  = {Fouille de donnée, Fouille de texte}
}
-
C. Carpineto and G. Romano, "Using concept lattices for text retrieval and mining," Formal concept analysis, vol. 3626, pp. 161-179, 2005.
@incollection{carpineto_using_2005,
  author    = {Carpineto, C. and Romano, G.},
  title     = {Using concept lattices for text retrieval and mining},
  booktitle = {Formal concept analysis},
  series    = {Lecture Notes in Computer Science},
  volume    = {3626},
  publisher = {Springer},
  year      = {2005},
  pages     = {161--179},
  issn      = {0302-9743},
  url       = {http://apps.isiknowledge.com/full_record.do?product=WOS&search_mode=CitationReport&qid=4&SID=1EOnkFI4kknf@49oNkC&page=1&doc=65},
  abstract  = {The potentials of formal concept analysis {(FCA)} for information retrieval {(IR)} have been highlighted by a number of research studies since its inception. With the proliferation of small-size specialised text databases available in electronic format and the advent of Web-based graphical interfaces, {FCA} has then become even more appealing and practical for searching text collections. The main advantage of {FCA} for {IR} is the possibility of eliciting context, which may be used both to improve the retrieval of specific items from a text collection and to drive the mining of its contents. In this paper, we will focus on the unique features of {FCA} for building contextual {IR} applications as well as on its most critical aspects. The development of a {FCA-based} application for mining the web results returned by a major search engine is envisaged as the next big challenge for the field.},
  keywords  = {Fouille de donnée, Méthodologie, Recherche d'information}
}
-
B. Grilheres, S. Canu, C. Beauce, and S. Brunessaux, "A platform for semantic annotations and ontology population using conditional random fields," in Proceedings. The 2005 IEEE/WIC/ACM International Conference on Web Intelligence, Compiegne, France, 2005, pp. 790-793.
@inproceedings{grilheres_platform_2005,
  author    = {Grilheres, B. and Canu, S. and Beauce, C. and Brunessaux, S.},
  title     = {A platform for semantic annotations and ontology population using conditional random fields},
  booktitle = {Proceedings. The 2005 {IEEE/WIC/ACM} International Conference on Web Intelligence},
  publisher = {{IEEE} Computer Society},
  address   = {Compiegne, France},
  year      = {2005},
  pages     = {790--793},
  abstract  = {Ontologies are widely used for organising and sharing knowledge. But elaborating these resources is a heavy and time-consuming task. This paper is two-fold: it describes {EADS} {DCS} text-mining platform, in particular, its service to annotate documents with semantic tags and it presents its extension for incremental learning of ontologies. Domain experts are assisted in the ontology population task by recent machine learning techniques (i.e. conditional random fields). Comparisons are made between annotations from the ontology and from a trained {CRF} model, so as to detect candidate instances. An iterative process controlled by the experts results in knowledge discovery and constitution of an accurate ontology},
  note      = {Copyright 2006, {IEE}},
  keywords  = {Analyse de texte, Fouille de donnée, Ontologie},
  annote    = {8747803 semantic annotation ontology population conditional random field knowledge organisation knowledge sharing {EADS} {DCS} text-mining document annotation semantic tags incremental learning machine learning knowledge discovery}
}
-
P.-N. Tan, M. Steinbach, and V. Kumar, Introduction to data mining, Reading, Mass.: Addison Wesley, 2005.
@book{tan_introduction_2005,
  author    = {Tan, Pang-Ning and Steinbach, Michael and Kumar, Vipin},
  title     = {Introduction to data mining},
  publisher = {Addison Wesley},
  address   = {Reading, Mass.},
  year      = {2005},
  isbn      = {0-321-32136-7},
  keywords  = {Fouille de donnée}
}
-
E. Leopold, M. May, and G. Paaß, "Data mining and text mining for science \& technology research." Netherlands: Springer, 2005, pp. 187-213.
@incollection{leopold_data_2005,
  author    = {Leopold, Edda and May, Michael and Paaß, Gerhard},
  title     = {Data mining and text mining for science \& technology research},
  booktitle = {Handbook of quantitative science and technology research : the use of publication and patent statistics in studies of {S\&T} systems},
  publisher = {Springer},
  address   = {Netherlands},
  year      = {2005},
  pages     = {187--213},
  doi       = {10.1007/1-4020-2755-9_9},
  url       = {http://dx.doi.org/10.1007/1-4020-2755-9_9},
  abstract  = {The goal of the paper is to give an overview on the state of the art of data mining and text mining approaches which are useful for bibliometrics and patent databases. The paper explains the basics of data mining in a non-technical manner. Basic approaches from statistics and machine learning are introduced in order to clarify the groundwork of data mining methods. Text mining is introduced as a special case of data mining. Data and text mining applications especially useful for bibliometrics and querying of patent databases are reviewed and three case studies are described.},
  keywords  = {Fouille de donnée, Fouille de texte}
}
-
S. Duplessie and H. Biggar, Data classification : why, when and how, Enterprise Strategy Group, 2005.
@techreport{duplessie_data_2005,
  author      = {Duplessie, Steve and Biggar, Heidi},
  title       = {Data classification : why, when and how},
  institution = {Enterprise Strategy Group},
  month       = oct,
  year        = {2005},
  keywords    = {Classification, Fouille de donnée},
  annote      = {duplessieSteve2005.pdf}
}
-
A. R. Martinez, "Data mining of text files," in Handbook of Statistics, volume 24 : data mining and data visualization, C. R. Rao, E. J. Wegman, and J. L. Solka, Eds., Elsevier, 2005, pp. 109-131.
@incollection{martinez_data_2005,
  author    = {Martinez, Angel R.},
  title     = {Data mining of text files},
  editor    = {Rao, C. R. and Wegman, E. J. and Solka, J. L.},
  booktitle = {Data mining and data visualization},
  series    = {Handbook of Statistics},
  volume    = {24},
  publisher = {Elsevier},
  year      = {2005},
  pages     = {109--131},
  url       = {http://www.sciencedirect.com/science/article/B7P6H-4G1M8FC-7/2/5d566ae622ee51e4264e38504a4dabf6},
  abstract  = {The goal of this chapter is to present textual data mining from a broad perspective, in addition to discussing several methods in computational statistics that can be applied to this area. I begin by discussing natural language processing at the word and sentence level, since a textual data mining system that seeks to discover knowledge requires methods that will capture and represent the semantic content of the text units. This section includes descriptions of hidden Markov models, probabilistic context-free grammars, and various supervised and unsupervised methods for word sense disambiguation. Next, I look at approaches beyond the word and sentence level, such as vector space models for information retrieval, latent semantic indexing, and a new approach based on a bigram proximity matrix. I conclude with a brief description of self-organizing maps.},
  keywords  = {Cluster, Découverte de connaissances, Fouille de donnée, Linguistique, Recherche d'information}
}
-
A. Amir, Y. Aumann, R. Feldman, and M. Fresko, "Maximal association rules : a tool for mining associations in text," Journal of intelligent information systems, vol. 25, iss. 3, pp. 333-345, 2005.
@article{amir_maximal_2005,
  author     = {Amir, Amihood and Aumann, Yonatan and Feldman, Ronen and Fresko, Moshe},
  title      = {Maximal association rules : a tool for mining associations in text},
  shorttitle = {Maximal association rules},
  journal    = {Journal of Intelligent Information Systems},
  volume     = {25},
  number     = {3},
  pages      = {333--345},
  month      = nov,
  year       = {2005},
  issn       = {0925-9902},
  doi        = {10.1007/s10844-005-0196-9},
  abstract   = {We describe a new tool for mining association rules, which is of special value in text mining. The new tool, called maximal associations, is geared toward discovering associations that are frequently lost when using regular association rules. Intuitively, a maximal association rule $X \stackrel{\max}{\Longrightarrow} Y$ says that whenever X is the only item of its type in a transaction, than Y also appears, with some confidence. Maximal associations allow the discovery of associations pertaining to items that most often do not appear alone, but rather together with closely related items, and hence associations relevant only to these items tend to obtain low confidence. We provide a formal description of maximal association rules and efficient algorithms for discovering all such associations. We present the results of applying maximal association rules to two text corpora.},
  keywords   = {Fouille de donnée, Fouille de texte},
  annote     = {amirAmihood2005.pdf}
}
-
V. Kashyap, C. Ramakrishnan, C. Thomas, and A. Sheth, "TaxaMiner : an experimentation framework for automated taxonomy bootstrapping," International Journal of Web and Grid Services, vol. 1, iss. 2, pp. 240-66, 2005.
@article{kashyap_taxaminer_2005,
  author   = {Kashyap, V. and Ramakrishnan, C. and Thomas, C. and Sheth, A.},
  title    = {{TaxaMiner} : an experimentation framework for automated taxonomy bootstrapping},
  journal  = {International Journal of Web and Grid Services},
  volume   = {1},
  number   = {2},
  pages    = {240--266},
  year     = {2005},
  issn     = {1741-1106},
  doi      = {10.1504/IJWGS.2005.008322},
  url      = {http://dx.doi.org/10.1504/IJWGS.2005.008322},
  abstract = {Construction of domain ontologies on the semantic Web is a human and resource intensive process, efforts to reduce which are crucial for the semantic Web to scale. We present a framework for automated taxonomy construction, that involves: (a) generation of a cluster hierarchy from a document corpus using statistical clustering and {NLP} techniques; (b) extraction of a topic hierarchy from this cluster hierarchy; and (c) assignment of labels to nodes in the topic hierarchy. Metrics for estimating topic hierarchy quality and parameters of an experimentation framework are identified. {MEDLINE®} was the document corpus and {MeSH} thesaurus was the gold standard},
  note     = {Copyright 2006, The Institution of Engineering and Technology},
  keywords = {Classification, Fouille de donnée, Langage naturel, Ontologie, Thésaurus, Web sémantique},
  annote   = {8880213 {TaxaMiner} automated taxonomy bootstrapping domain ontology semantic Web document corpus cluster hierarchy generation statistical clustering natural language processing topic hierarchy extraction {MEDLINE} {MeSH} thesaurus label assignment}
}
-
T. Takemura, K. Shimai, H. Matsui, S. Manabe, N. Ashida, and H. Yoshihara, "An extraction of medical knowledge on text mining for ubiquitous medicine," Odawara, Japan, 2004, pp. 114-117.
-
J. T. L. Wang, M. J. Zaki, H. T. T. Toivonen, and D. Shasha, Data mining in bioinformatics, 2004.
@book{wang_data_2004,
  author   = {Wang, Jason T. L. and Zaki, Mohammed J. and Toivonen, Hannu T. T. and Shasha, Dennis},
  title    = {Data mining in bioinformatics},
  year     = {2004},
  isbn     = {1852336714},
  url      = {http://books.google.ca/books?id=JLioFFPyisYC&hl=fr},
  abstract = {The goal of this book is to help readers understand state-of-the-art techniques in biological data mining and data management and includes topics such as: - preprocessing tasks such as data cleaning and data integration as applied to biological data - classification and clustering techniques for microarrays - comparison of {RNA} structures based on string properties and energetics - discovery of the sequence characteristics of different parts of the genome - mining of haplotypes to find disease markers - sequencing of events leading to the folding of a protein - inference of the subcellular location of protein activity - classification of chemical compounds based on structure - special purpose metrics and index structures for phylogenetic applications - a new query language for protein searching based on the shape of proteins - very fast indexing schemes for sequences and pathways. Aimed at computer scientists, necessary biology is explained.},
  keywords = {Bio informatic, Fouille de donnée},
}
-
M. W. Berry, Survey of text mining : clustering, classification, and retrieval, New York: Springer-Verlag, 2004.
@book{berry_survey_2004,
  author    = {Berry, Michael W.},
  title     = {Survey of text mining : clustering, classification, and retrieval},
  address   = {New York},
  publisher = {{Springer-Verlag}},
  year      = {2004},
  isbn      = {0387955631},
  keywords  = {Cluster, Fouille de donnée},
  annote    = {TOC : I. Clustering and Classification -- 1. {Cluster-Preserving} Dimension Reduction Methods for Efficient Classification of Text Data / 2. Automatic Discovery of Similar Words / 3. Simultaneous Clustering and Dynamic Keyword Weighting for Text Documents / 4. Feature Selection and Document Clustering / {II.} Information Extraction and Retrieval -- 5. Vector Space Models for Search and Cluster Mining / 6. {HotMiner:} Discovering Hot Topics from Dirty Text / 7. Combining Families of Information Retrieval Algorithms Using Metalearning / {III.} Trend Detection -- 8. Trend and Behavior Detection from Web Queries / 9. A Survey of Emerging Trend Detection in Textual Data Mining /},
}
-
J. Demsar, B. Zupan, and G. Leban, Orange : from experimental machine learning to interactive data mining, 2004.
@misc{demsar_orange_2004,
  author       = {Demsar, J. and Zupan, B. and Leban, G.},
  title        = {Orange : from experimental machine learning to interactive data mining},
  organization = {University of Ljubljana},
  howpublished = {http://magix.fri.uni-lj.si/orange/},
  year         = {2004},
  url          = {http://magix.fri.uni-lj.si/orange/},
  keywords     = {Apprentissage machine, Fouille de donnée},
}
-
S. Sirmakessis, Text mining and its applications : results of the NEMIS Launch Conference, Berlin; New York: Springer-Verlag, 2004.
@book{sirmakessis_text_2004,
  author    = {Sirmakessis, Spiros},
  title     = {Text mining and its applications : results of the {NEMIS} Launch Conference},
  series    = {Studies in fuzziness and soft computing},
  number    = {138},
  address   = {Berlin; New York},
  publisher = {{Springer-Verlag}},
  year      = {2004},
  isbn      = {3540202382},
  keywords  = {Fouille de donnée},
  annote    = {Mining for Gems of Information / From Text to Information: Document Processing and Visualization, a Text Mining Approach / Web Mining: The Past, the Present, and Future / Applications, Sectors and Strategies of Text Mining, a First Overall Picture / Text Classification of News Articles with Support Vector Machines / A Review of Web Document Clustering Approaches / Supervised Term Weighting for Automated Text Categorization / Machine Learning for Information Extraction in Genomics - State of the Art and Perspectives / Processing Multilingual Collection for Text Mining Applications / Text Mining Tools: Evaluation Methods and Criteria / Knowledge Advantage through online Text Mining. Research Trends in Competitive Intelligence and Virtual Communities Applications / Real Time Customer Opinion Monitoring / Validation Techniques in Text Mining (with Application to the Processing of Open-ended Questions) / Clickstream Analysis, Semiotic Interpretation and Semantic Text Mining for a Distance Measurement on the Hypertextual Map of an Internet-portal / Text Mining in Official Statistic /},
}
-
C. Clifton, R. Cooley, and J. Rennie, "TopCat : data mining for topic identification in a text corpus," IEEE Transactions on Knowledge and Data Engineering, vol. 16, iss. 8, pp. 949-964, 2004.
@article{clifton_topcat_2004,
  author   = {Clifton, C. and Cooley, R. and Rennie, J.},
  title    = {{TopCat} : data mining for topic identification in a text corpus},
  journal  = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume   = {16},
  number   = {8},
  pages    = {949--964},
  year     = {2004},
  issn     = {1041-4347},
  doi      = {10.1109/TKDE.2004.32},
  abstract = {{TopCat} (topic categories) is a technique for identifying topics that recur in articles in a text corpus. Natural language processing techniques are used to identify key entities in individual articles, allowing us to represent an article as a set of items. This allows us to view the problem in a database/data mining context: Identifying related groups of items. We present a novel method for identifying related items based on traditional data mining techniques. Frequent itemsets are generated from the groups of items, followed by clusters formed with a hypergraph partitioning scheme. We present an evaluation against a manually categorized ground truth news corpus; it shows this technique is effective in identifying topics in collections of news articles.},
  keywords = {Cluster, Fouille de donnée, Langage naturel},
  annote   = {cliftonChris2004.pdf},
}
-
R. Mack, S. Mukherjea, A. Soffer, N. Uramoto, E. Brown, A. Coden, J. Cooper, A. Inokuchi, B. Iyer, Y. Mass, H. Matsuzawa, and L. V. Subramaniam, "Text analytics for life science using the unstructured information management architecture," IBM Systems Journal, vol. 43, iss. 3, pp. 490-515, 2004.
@article{mack_text_2004,
  author   = {R. Mack and S. Mukherjea and A. Soffer and N. Uramoto and E. Brown and A. Coden and J. Cooper and A. Inokuchi and B. Iyer and Y. Mass and H. Matsuzawa and L. V. Subramaniam},
  title    = {Text analytics for life science using the unstructured information management architecture},
  journal  = {{IBM} Systems Journal},
  volume   = {43},
  number   = {3},
  pages    = {490--515},
  year     = {2004},
  abstract = {Biomedical text plays a fundamental role in knowledge discovery in life science, in both basic research (in the field of bioinformatics) and in industry sectors devoted to improving medical practice, drug development, and health care (such as medical informatics, clinical genomics, and other sectors). Several groups in the {IBM} Research Division are collaborating on the development of a prototype system for text analysis, search, and text-mining methods to support problem solving in life science. The system is called {"BioTeKS"} (" Biological Text Knowledge Services"), and it integrates research technologies from multiple {IBM} Research labs. {BioTeKS} is also the first major application of the {UIMA} {(Unstructured} Information Management Architecture) initiative also emerging from {IBM} Research. {BioTeKS} is intended to analyze biomedical text such as {MEDLINE} abstracts, medical records, and patents; text is analyzed by automatically identifying terms or names corresponding to key biomedical entities (e.g., " genes," "proteins," "compounds," or " drugs") and concepts or facts related to them. In this paper, we describe the value of text analysis in biomedical research, the development of the {BioTeKS} system, and applications which demonstrate its functions. © 2004 {IBM.}},
  keywords = {Fouille de donnée, Recherche d'information},
}
-
E. C. Mavrikas, N. Nicoloyannis, and E. Kavakli, "Cultural heritage information on the semantic Web," in Engineering knowledge in the age of the semantic Web : 14th international conference, EKAW 2004 : proceedings, 2004, pp. 477-478.
@inproceedings{mavrikas_cultural_2004,
  author    = {Mavrikas, E. C. and Nicoloyannis, N. and Kavakli, E.},
  title     = {Cultural heritage information on the semantic Web},
  booktitle = {Engineering knowledge in the age of the semantic Web : 14th international conference, {EKAW} 2004 : proceedings},
  series    = {Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)},
  volume    = {3257},
  pages     = {477--478},
  publisher = {{Springer-Verlag}},
  year      = {2004},
  abstract  = {In this paper, we outline an ontology-driven approach to the organisation, classification, and mining of cultural heritage documents on the semantic Web. We propose its implementation as a person-machine system that uses statistical {NLP} methods to extract cultural heritage information from texts contained in distributed information sources connected within a schema-based peer-to-peer network infrastructure},
  note      = {Copyright 2005, {IEE}},
  keywords  = {Fouille de donnée, Langage naturel, Ontologie, Web sémantique},
  annote    = {8303562 semantic Web ontology-driven approach cultural heritage document mining person-machine system statistical natural language processing methods distributed information sources peer-to-peer network},
}
-
E. G. Bremer, J. Natarajan, Y. Zhang, C. DeSesa, C. J. Hack, and W. Dubitzky, "Text mining of full text articles and creation of a knowledge base for analysis of microarray data," in Knowledge exploration in life science informatics : international symposium KELSI 2004, Milan, Italy, november 25-26, 2004 : proceedings, 2004, pp. 84-95.
@inproceedings{bremer_text_2004,
  author    = {Bremer, E. G. and Natarajan, J. and Zhang, Yonghong and {DeSesa}, C. and Hack, C. J. and Dubitzky, W.},
  title     = {Text mining of full text articles and creation of a knowledge base for analysis of microarray data},
  booktitle = {Knowledge exploration in life science informatics : international symposium {KELSI} 2004, Milan, Italy, november 25-26, 2004 : proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {3303},
  pages     = {84--95},
  publisher = {{Springer-Verlag}},
  year      = {2004},
  abstract  = {Automated extraction of information from biological literature promises to play an increasingly important role in text-based knowledge discovery processes. This is particularly true in regards to high throughput approaches such as microarrays and combining data from different sources in a systems biology approach. We have developed an integrated system that combines protein/gene name dictionaries, synonymy dictionaries, natural language processing, and pattern matching rules to extract and organize gene relationships from full text articles. In the first phase full text articles were collected from 20 peer-reviewed journals in the field of molecular biology and biomedicine over the last 5 years (1999-2003). The extracted relationships were organized in a database that included the unique {PubMed} {ID} and section id (abstract, introduction, materials and method, and results and discussion) to identify the source article and section from which concepts were extracted. The system architecture, its uniqueness and advantages are presented in this paper. It is hoped that the resulting knowledge base will assist in the understanding of gene lists generated from microarray experiments},
  note      = {Copyright 2005, {IEE}},
  keywords  = {Analyse documentaire, Fouille de texte},
}
-
B. Dai, "A road map to more effective web personalization : integrating domain knowledge with web usage mining." 2003.
@inproceedings{dai_road_2003,
  author   = {Dai, B.},
  title    = {A road map to more effective web personalization : integrating domain knowledge with web usage mining},
  year     = {2003},
  url      = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.12.2273},
  abstract = {Personalization based on Web usage mining can enhance the effectiveness and scalability of collaborative filtering. However, without semantic knowledge about the underlying domain, such systems cannot recommend different types of complex objects based in their underlying properties and attributes. This paper provides an overview of approaches for incorporating semantic knowledge into Web usage mining and personalization processes. We present two general approaches to integrate semantic knowledge extracted from the content features of pages into the usage-based personalization process. Next, we present a general framework of integrating domain ontologies with},
  keywords = {Fouille de donnée, Web},
  annote   = {overview Personnalisation Web},
}
-
N. Ye, The handbook of data mining, Mahwah, N.J.: Lawrence Erlbaum Associates, 2003.
@book{ye_handbook_2003,
  author    = {Ye, Nong},
  title     = {The handbook of data mining},
  series    = {Human factors and ergonomics},
  address   = {Mahwah, {N.J.}},
  publisher = {Lawrence Erlbaum Associates},
  year      = {2003},
  isbn      = {0805840818},
  keywords  = {Fouille de donnée},
  annote    = {TOC : Foreword / Preface / I. Methodologies of Data Mining -- 1. Decision Trees / 2. Association Rules / 3. Artificial Neural Network Models for Data Mining / 4. Statistical Analysis of Normal and Abnormal Data / 5. Bayesian Data Analysis / 6. Hidden Markov Processes and Sequential Pattern Mining / 7. Strategies and Methods for Prediction / 8. Principal Components and Factor Analysis / 9. Psychometric Methods of Latent Variable Modeling / 10. Scalable Clustering / 11. Time Series Similarities and Indexing / 12. Nonlinear Time Series Analysis / 13. Distributed Data Mining / {II.} Management of Data Mining -- 14. Data Collection, Preparation, Quality, and Visualization / 15. Data Storage and Management / 16. Feature Extraction, Selection, and Construction / 17. Performance Analysis and Evaluation / 18. Security and Privacy / 19. Emerging Standards and Interfaces / {III.} Applications of Data Mining -- 20. Mining Human Performance Data / 21. Mining Text Data / 22. Mining Geospatial Data / 23. Mining Science and Engineering Data / 24. Mining Data in Bioinformatics / 25. Mining Customer Relationship Management {(CRM)} Data / 26. Mining Computer and Network Security Data / 27. Mining Image Data / 28. Mining Manufacturing Quality Data /},
}
-
E. Hochsztain, S. Millán, B. Pardo, J. Peña, and E. Menasalvas, "A framework to integrate business goals in Web usage mining," in Advances in Web intelligence : first international Atlantic Web intelligence conference, AWIC 2003, Madrid, Spain, may 2003 : proceedings, New York, 2003, pp. 28-36.
@inproceedings{hochsztain_framework_2003,
  author    = {Hochsztain, E. and Millán, S. and Pardo, B. and Peña, J. and Menasalvas, E.},
  title     = {A framework to integrate business goals in Web usage mining},
  booktitle = {Advances in Web intelligence : first international Atlantic Web intelligence conference, {AWIC} 2003, Madrid, Spain, may 2003 : proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {2663},
  pages     = {28--36},
  address   = {New York},
  publisher = {Springer},
  year      = {2003},
  abstract  = {Web mining is a broad term that has been used to refer to the process of information discovery from Web sources: content, structure, and usage. Information collected by web servers and kept in the server log is the main source of data for analyzing user navigation patterns. Notwithstanding, knowing the most frequent user paths is not enough: it is necessary to integrate web mining with the company site goals in order to make sites more competitive. The concept of Web Goal Mining is introduced in this paper to refer to the process information discovery of the relationship between site visitors and sponsor goals.},
  keywords  = {Fouille de donnée, Fouille de texte},
  annote    = {hochsztainE2003.pdf},
}
-
M. Huisman and M. A. J. van Duijn, "Software for statistical analysis of social networks," Connections, vol. 25, pp. 7-26, 2003.
@article{huisman_software_2003,
  author     = {Huisman, Mark and van Duijn, Marijtje A. J.},
  title      = {Software for statistical analysis of social networks},
  journal    = {Connections},
  volume     = {25},
  pages      = {7--26},
  year       = {2003},
  eprint     = {10.1.1.105.9814},
  eprinttype = {citeseerx},
  url        = {http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=9AE0B44C613542212FF9AA8E4F55314C?doi=10.1.1.105.9814&rep=rep1&type=pdf},
  abstract   = {This paper gives a state-of-the-art overview of available software for the statistical analysis of social networks as of Summer 2004. It reviews and compares software packages for social network analysis with respect to their statistical procedures, illustrating some procedures with example data. The choice of routines that were inspected is restricted to procedures for statistical modeling based on probability distributions (e.g., exponential random graph models, {QAP} correlation, statistical analysis of longitudinal network data). This definition of analysis routines excludes the extensive review of procedure-based routines based on more complex (iterative) algorithms like cluster analysis or eigendecompositions. The paper concludes with some recommendations. Key words: exponential random graph model, longitudinal network data, statistical modelling, software packages, permutation tests.},
  keywords   = {Approche statistique, Fouille de donnée},
  annote     = {huismanMark2003.pdf},
}
-
D. H. Kraft, M. J. Martin-Bautista, J. Chen, and D. Sanchez, "Rules and fuzzy rules in text : concept, extraction and usage," International Journal of Approximate Reasoning, vol. 34, iss. 2-3, pp. 145-161, 2003.
@article{kraft_rules_2003,
  author   = {Kraft, D. H. and Martin-Bautista, M. J. and Chen, J. and Sanchez, D.},
  title    = {Rules and fuzzy rules in text : concept, extraction and usage},
  journal  = {International Journal of Approximate Reasoning},
  volume   = {34},
  number   = {2--3},
  pages    = {145--161},
  year     = {2003},
  issn     = {0888-613X},
  doi      = {10.1016/j.ijar.2003.07.005},
  url      = {http://dx.doi.org/10.1016/j.ijar.2003.07.005},
  abstract = {Several concepts and techniques have been imported from other disciplines such as machine learning and artificial intelligence to the field of textual data. We focus on the concept of rule and the management of uncertainty in text applications. The different structures considered for the construction of the rules, the extraction of the knowledge base and the applications and usage of these rules are detailed. We include a review of the most relevant works of the different types of rules based on their representation and their application to most of the common tasks of information retrieval such as categorization, indexing and classification},
  keywords = {Fouille de donnée, Fuzzy, Recherche d'information},
  annote   = {Copyright 2004, {IEE} 8148260 {0888-613X} fuzzy rules machine learning artificial intelligence textual data knowledge base system information retrieval fuzzy logic association rule},
}
-
Y. Kambayashi, M. Mohania, and W. Wöss, Data warehousing and knowledge discovery 5th international conference, DaWaK, 2003.
@book{kambayashi_data_2003,
  author   = {Kambayashi, Y. and Mohania, Mukesh and Wöss, Wolfram},
  title    = {Data warehousing and knowledge discovery 5th international conference, {DaWaK}},
  year     = {2003},
  isbn     = {{354040807X}},
  url      = {http://books.google.ca/books?id=mnpWayx8h7gC&hl=fr},
  abstract = {This book constitutes the refereed proceedings of the 5th International Conference on Data Warehousing and Knowledge Discovery, {DaWaK} 2003, held in Prague, Czech Republic in September 2003. The 41 revised full papers presented were carefully reviewed and selected from more than 130 submissions. The papers are organized in topical sections on data cubes and queries, multidimensional data models, Web warehousing, change detection, Web mining and association rules, association rules and decision trees, clustering, association rule mining, data analysis and discovery, ontologies and improving data quality, queries and data patterns, improving database query engines, and sampling and vector classification.},
  keywords = {Découverte de connaissances, Fouille de donnée},
}
-
J. Atkinson-Abutridy, C. Mellish, and S. Aitken, "A semantically guided and domain-independent evolutionary model for knowledge discovery from texts," IEEE Transactions on Evolutionary Computation, vol. 7, iss. 6, pp. 546-560, 2003.
@article{atkinson-abutridy_semantically_2003,
  author   = {Atkinson-Abutridy, John and Mellish, Chris and Aitken, Stuart},
  title    = {A semantically guided and domain-independent evolutionary model for knowledge discovery from texts},
  journal  = {{IEEE} Transactions on Evolutionary Computation},
  volume   = {7},
  number   = {6},
  pages    = {546--560},
  year     = {2003},
  issn     = {1089-778X},
  abstract = {We present a novel evolutionary model for knowledge discovery from texts {(KDTs),} which deals with issues concerning shallow text representation and processing for mining purposes in an integrated way. Its aims is to look for novel and interesting explanatory knowledge across text documents. The approach uses natural language technology and genetic algorithms to produce explanatory novel hypotheses. The proposed approach is interdisciplinary, involving concepts not only from evolutionary algorithms but also from many kinds of text mining methods. Accordingly, new kinds of genetic operations suitable for text mining are proposed. The principles behind the representation and a new proposal for using multiobjective evaluation at the semantic level are described. Some promising results and their assessment by human experts are also discussed which indicate the plausibility of the model for effective {KDT.}},
  keywords = {Fouille de donnée, Recherche d'information},
  annote   = {Compilation and indexing terms, Copyright 2007 Elsevier Inc. All rights reserved 04057997412 {1089-778X} Knowledge discovery from texts Knowledge discovery in databases Information extraction Text mining methods},
}
L. Massey, "Evaluating quality of text clustering with ART1," in Proceedings of the International Joint Conference on Neural Networks, 2003, pp. 1402-1407.
@inproceedings{massey_evaluating_2003,
  author    = {Massey, Louis},
  title     = {Evaluating quality of text clustering with {ART1}},
  booktitle = {Proceedings of the International Joint Conference on Neural Networks, 2003},
  volume    = {2},
  pages     = {1402--1407},
  year      = {2003},
  isbn      = {0780378989},
  doi       = {10.1109/IJCNN.2003.1223901},
  url       = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1223901&isnumber=27486},
  abstract  = {Self-organizing large amounts of textual data in accordance to some topics structure is an increasingly important application of clustering. Adaptive resonance theory {(ART)} neural networks possess several interesting properties that make them appealing in this area. Although {ART} has been used in several research works as a text clustering tool, the level of quality of the resulting document clusters has not been clearly established yet. In this paper, we present experimental results with binary {ART} that address this issue by determining how close clustering quality is to an upper bound on clustering quality.},
  keywords  = {Cluster, Fouille de donnée, Fuzzy, Recherche d'information},
  annote    = {masseyLouis2003.pdf},
}
-
B. M. Thuraisingham, Web data mining and applications in business intelligence and counter-terrorism, Auerbach Publications, 2003.
@book{thuraisingham_web_2003,
  author    = {Thuraisingham, Bhavani M.},
  title     = {Web data mining and applications in business intelligence and counter-terrorism},
  publisher = {Auerbach Publications},
  year      = {2003},
  isbn      = {9780849314605},
  url       = {http://library.books24x7.com/toc.asp?bookid=5976},
  abstract  = {Armed with the knowledge contained in this book, businesses can collect and analyze Web-based data to help develop customer relationships, increase sales, and identify existing and potential threats.},
  keywords  = {Business intelligence, Fouille de donnée, Web},
  annote    = {Accessible en ligne via Books24x7 (http://library.books24x7.com/toc.asp?bookid=5976)},
}
-
P. Berkhin, "Survey of clustering data mining techniques," 2002.
@misc{berkhin_survey_2002,
  author     = {Berkhin, Pavel},
  title      = {Survey of clustering data mining techniques},
  year       = {2002},
  eprint     = {10.1.1.18.3739},
  eprinttype = {citeseerx},
  url        = {http://www.ee.ucr.edu/~barth/EE242/clustering_survey.pdf},
  keywords   = {Cluster, Fouille de donnée},
  annote     = {berkhinPavel2002.pdf},
}
-
W. Klösgen and J. M. Żytkow, Handbook of data mining and knowledge discovery, Oxford ; New York: Oxford University Press, 2002.
@book{klsgen_handbook_2002,
  author    = {Klösgen, Willi and Żytkow, Jan M.},
  title     = {Handbook of data mining and knowledge discovery},
  address   = {Oxford ; New York},
  publisher = {Oxford University Press},
  year      = {2002},
  isbn      = {0195118316},
  keywords  = {Découverte de connaissances, Fouille de donnée},
}
-
C. C. Chen, M. C. Chen, and Y. Sun, "PVA: a self-adaptive personal view agent," Journal of intelligent information systems, vol. 18, iss. 2-3, pp. 173-194, 2002.
@article{chen_pva:self-adaptive_2002,
  author   = {Chen, Chien Chin and Chen, Meng Chang and Sun, Yeali},
  title    = {{PVA}: a self-adaptive personal view agent},
  journal  = {Journal of Intelligent Information Systems},
  volume   = {18},
  number   = {2--3},
  pages    = {173--194},
  year     = {2002},
  doi      = {10.1023/A:1013629527840},
  url      = {http://www.springerlink.com/content/x224178u6q54389r/fulltext.pdf},
  abstract = {In this paper, we present {PVA,} an adaptive personal view information agent system for tracking, learning and managing user interests in Internet documents. {PVA} consists of three parts: a proxy, personal view constructor, and personal view maintainer. The proxy logs the user's activities and extracts the user's interests without user intervention. The personal view constructor mines user interests and maps them to a class hierarchy (i.e., personal view). The personal view maintainer synchronizes user interests and the personal view periodically. When user interests change, in {PVA,} not only the contents, but also the structure of the user profile are modified to adapt to the changes. In addition, {PVA} considers the aging problem of user interests. The experimental results show that modulating the structure of the user profile increases the accuracy of a personalization system.},
  keywords = {Fouille de donnée, Web},
  annote   = {chenChien2002.pdf},
}
-
J. Grabmeier and A. Rudolph, "Techniques of cluster algorithms in data mining," Data mining and knowledge discovery, vol. 6, iss. 4, pp. 303-360, 2002.
@article{grabmeier_techniques_2002,
  author   = {Grabmeier, Johannes and Rudolph, Andreas},
  title    = {Techniques of cluster algorithms in data mining},
  journal  = {Data Mining and Knowledge Discovery},
  volume   = {6},
  number   = {4},
  pages    = {303--360},
  year     = {2002},
  issn     = {1384-5810},
  eissn    = {1573-756X},
  doi      = {10.1023/A:1016308404627},
  url      = {http://www.springerlink.com/content/d6ekxxcu0d2ngamj/fulltext.pdf},
  abstract = {An overview of cluster analysis techniques from a data mining point of view is given. This is done by a strict separation of the questions of various similarity and distance measures and related optimization criteria for clusterings from the methods to create and modify clusterings themselves. In addition to this general setting and overview, the second focus is used on discussions of the essential ingredients of the demographic cluster algorithm of {IBM's} Intelligent Miner, based Condorcet's criterion.},
  keywords = {Cluster, Fouille de donnée},
  annote   = {grabmeierJohannes2002.pdf},
}
-
M. Wasson, Data mining and text-based information, 2002.
@misc{wasson_data_2002,
  author   = {Wasson, Mark},
  title    = {Data mining and text-based information},
  type     = {{PowerPoint}},
  month    = aug,
  year     = {2002},
  url      = {http://www.asis.org/Chapters/soasis/events/20020827.ppt},
  keywords = {Fouille de donnée},
  annote   = {wassonMark2002.ppt},
}
-
Y. Fu, T. Bauer, J. Mostafa, M. Palakal, and S. Mukhopadhyay, "Concept extraction and association from cancer literature," in Proceedings of the International Workshop on Web Information and Data Management, McLean, VA, United States, 2002, pp. 100-103.
@inproceedings{fu_concept_2002,
  author    = {Fu, Yueyu and Bauer, Travis and Mostafa, Javed and Palakal, Mathew and Mukhopadhyay, Snehasis},
  title     = {Concept extraction and association from cancer literature},
  booktitle = {Proceedings of the International Workshop on Web Information and Data Management},
  pages     = {100--103},
  address   = {McLean, VA, United States},
  publisher = {Association for Computing Machinery},
  year      = {2002},
  doi       = {10.1145/584931.584953},
  url       = {http://dx.doi.org/10.1145/584931.584953},
  abstract  = {There is a large and growing body of web accessible biomedical literature. As this body of electronic literature grows, so does the possibility that document analysis techniques can be used to automatically extract useful biomedical information from them, particularly in the discovery of key concepts dealing with genes, proteins, drugs, and diseases and associations among these concepts. {VCGS} {(Vocabulary} Cluster Generating System) was designed to automatically extract and determine associations among tokens from a subset of biomedical literature namely cancer. Such information has notable potential to automate database construction in biomedicine, instead of relying on experts' analysis. This paper reports on the mechanisms for automatically generating clusters of tokens. A formal evaluation of the system, based on a subset of 5338 Pubmed titles and abstracts, has been conducted against the {Swiss-Prot} database in which the associations among concepts are entered by experts by hand.},
  keywords  = {Fouille de donnée, Langage naturel, Web},
  annote    = {Compilation and indexing terms, Copyright 2007 Elsevier Inc. All rights reserved 04128070260 Web data mining Web information extraction Natural language processing {(NLP)} techniques},
}
-
G. Balmisse, Le Web mining, Km Center, 2002.
@misc{balmisse_le_2002,
  author       = {Balmisse, Gilles},
  title        = {Le Web mining},
  organization = {Km Center},
  month        = sep,
  year         = {2002},
  abstract     = {On peut définir le web mining comme étant l'application des techniques du data mining à l'exploitation des données disponibles sur le web. Le data mining consiste à utiliser un ensemble de techniques statistiques qui, en " fouillant " un grand nombre de données structurées, permettent de découvrir et de présenter des informations à valeur ajoutée dans une forme interprétable facilement par un individu.},
  keywords     = {Fouille de donnée, Web},
  annote       = {balmisseGilles2002\_3.pdf},
}
-
C. Haruechaiyasak, M.-L. Shyu, S.-C. Chen, and X. Li, "Web document classification based on fuzzy association," in Proceedings of the 26th international computer software and applications conference on prolonging software life : development and redevelopment, Washington, DC, 2002, pp. 487-492.
@inproceedings{haruechaiyasak_web_2002,
  author    = {Haruechaiyasak, Choochart and Shyu, Mei-Ling and Chen, Shu-Ching and Li, Xiuqi},
  title     = {Web document classification based on fuzzy association},
  booktitle = {Proceedings of the 26th international computer software and applications conference on prolonging software life : development and redevelopment},
  pages     = {487--492},
  publisher = {{IEEE} Computer Society},
  address   = {Washington, {DC}},
  year      = {2002},
  isbn      = {0-7695-1727-7},
  abstract  = {In this paper, a method of automatically classifying Web documents into a set of categories using the fuzzy association concept is proposed. Using the same word or vocabulary to describe different entities creates ambiguity, especially in the Web environment where the user population is large. To solve this problem, fuzzy association is used to capture the relationships among different index terms or keywords in the documents, i.e., each pair of words has an associated value to distinguish itself from the others. Therefore, the ambiguity in word usage is avoided. Experiments using data sets collected from two Web portals: Yahoo! (www.yahoo.com) and Open Directory Project (dmoz.org) are conducted. We compare our approach to the vector space model with the cosine coefficient. The results show that our approach yields higher accuracy compared to the vector space model.},
  keywords  = {Classification, Fouille de donnée, Fuzzy, Web},
  annote    = {haruechaiyasakChoochart2002.pdf},
}
-
P. Bajcsy, Introduction of data mining, 2002.
@misc{bajcsy_introduction_2002,
  author   = {Bajcsy, Peter},
  title    = {Introduction of data mining},
  type     = {{PowerPoint}},
  year     = {2002},
  url      = {http://algdocs.ncsa.uiuc.edu/PR-20021116-1.ppt},
  keywords = {Fouille de donnée},
  annote   = {bajcsyPeter2002.ppt},
}
-
B. Berendt, A. Hotho, and G. Stumme, "Towards semantic web mining." 2002.
@inproceedings{berendt_towards_2002,
  author    = {Berendt, B. and Hotho, A. and Stumme, G.},
  title     = {Towards semantic web mining},
  booktitle = {The Semantic Web -- {ISWC} 2002: First International Semantic Web Conference, Sardinia, Italy, June 9--12, 2002, Proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {2342},
  pages     = {264--278},
  publisher = {Springer},
  year      = {2002},
  url       = {http://citeseer.ist.psu.edu/berendt02towards.html},
  abstract  = {Semantic Web Mining aims at combining the two fast-developing research areas Semantic Web and Web Mining. The idea is to improve, on the one hand, the results of Web Mining by exploiting the new semantic structures in the Web; and to make use of Web Mining, on the other hand, for building up the Semantic Web. This paper gives an overview of where the two areas meet today, and sketches ways of how a closer integration could be profitable.},
  keywords  = {Fouille de donnée, Web sémantique},
}
-
W. Hsu, M. L. Lee, and J. Zhang, "Image mining : trends and developments," Journal of intelligent information systems, vol. 19, iss. 1, pp. 7-23, 2002.
@article{hsu_image_2002,
  author     = {Hsu, Wynne and Lee, Mong Li and Zhang, Ji},
  title      = {Image mining : trends and developments},
  shorttitle = {Image Mining},
  journal    = {Journal of intelligent information systems},
  volume     = {19},
  number     = {1},
  pages      = {7--23},
  month      = jul,
  year       = {2002},
  issn       = {0925-9902, 1573-7675},
  doi        = {10.1023/A:1015508302797},
  url        = {http://www.springerlink.com/content/7ty2k1dfh1duv6vv/fulltext.pdf},
  abstract   = {Advances in image acquisition and storage technology have led to tremendous growth in very large and detailed image databases. These images, if analyzed, can reveal useful information to the human users. Image mining deals with the extraction of implicit knowledge, image data relationship, or other patterns not explicitly stored in the images. Image mining is more than just an extension of data mining to image domain. It is an interdisciplinary endeavor that draws upon expertise in computer vision, image processing, image retrieval, data mining, machine learning, database, and artificial intelligence. In this paper, we will examine the research issues in image mining, current developments in image mining, particularly, image mining frameworks, state-of-the-art techniques and systems. We will also identify some future research directions for image mining.},
  keywords   = {Fouille de donnée},
  annote     = {hsuWynne2002.pdf},
}
-
P. Giudici, D. Heckerman, and J. Whittaker, "Statistical models for data mining," Data mining and knowledge discovery, vol. 5, iss. 3, pp. 163-165, 2001.
@article{giudici_statistical_2001,
  author   = {Giudici, Paolo and Heckerman, David and Whittaker, Joe},
  title    = {Statistical models for data mining},
  journal  = {Data mining and knowledge discovery},
  volume   = {5},
  number   = {3},
  pages    = {163--165},
  month    = jul,
  year     = {2001},
  issn     = {1384-5810, 1573-756X},
  doi      = {10.1023/A:1011452614423},
  url      = {http://www.springerlink.com/content/x474882357636616/fulltext.pdf},
  abstract = {We review the background to the papers presented in this special issue and give a short introduction to each. We also briefly describe the workshop on ``Statistical models for data mining'', held in Pavia (Italy), in October 2000, where the papers were presented.},
  keywords = {Approche statistique, Fouille de donnée},
  annote   = {giudiciPaolo2001.pdf},
}
-
Y. Kodratoff, "Comparing machine learning and knowledge discovery in databases : an application to knowledge discovery in texts." Berlin, Heidelberg: Springer, 2001, pp. 1-21.
@incollection{kodratoff_comparing_2001,
  author    = {Kodratoff, Yves},
  title     = {Comparing machine learning and knowledge discovery in databases : an application to knowledge discovery in texts},
  booktitle = {Machine learning and its applications : advanced lectures},
  series    = {Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)},
  volume    = {2049},
  pages     = {1--21},
  publisher = {Springer},
  address   = {Berlin, Heidelberg},
  year      = {2001},
  isbn      = {3-540-42490-1},
  keywords  = {Fouille de donnée, Intelligence artificielle},
  annote    = {kodratoffYves2001.pdf},
}
-
T. Hastie, R. Tibshirani, and J. H. Friedman, The elements of statistical learning : data mining, inference, and prediction, New York: Springer, 2001.
@book{hastie_elements_2001,
  author    = {Hastie, Trevor and Tibshirani, Robert and Friedman, J. H.},
  title     = {The elements of statistical learning : data mining, inference, and prediction},
  series    = {Springer series in statistics},
  publisher = {Springer},
  address   = {New York},
  year      = {2001},
  isbn      = {0387952845},
  keywords  = {Apprentissage machine, Approche statistique, Fouille de donnée},
}
-
D. J. Hand, H. Mannila, and P. Smyth, Principles of data mining, Cambridge, Mass.: MIT Press, 2001.
@book{hand_principles_2001,
  author    = {Hand, D. J. and Mannila, Heikki and Smyth, Padhraic},
  title     = {Principles of data mining},
  series    = {Adaptive computation and machine learning},
  publisher = {{MIT} Press},
  address   = {Cambridge, Mass.},
  year      = {2001},
  isbn      = {026208290X},
  keywords  = {Fouille de donnée},
  annote    = {TOC : 1. Introduction -- 2. Measurement and Data -- 3. Visualizing and Exploring Data -- 4. Data Analysis and Uncertainty -- 5. A Systematic Overview of Data Mining Algorithms -- 6. Models and Patterns -- 7. Score Functions for Data Mining Algorithms -- 8. Search and Optimization Methods -- 9. Descriptive Modeling -- 10. Predictive Modeling for Classification -- 11. Predictive Modeling for Regression -- 12. Data Organization and Databases -- 13. Finding Patterns and Rules -- 14. Retrieval by Content -- App. Random Variables.},
}
-
U. Fayyad, G. G. Grinstein, and A. Wierse, Information visualization in data mining and knowledge discovery, San Francisco: Morgan Kaufmann, 2001.
@book{fayyad_information_2001,
  author    = {Fayyad, Usama and Grinstein, Georges G. and Wierse, Andreas},
  title     = {Information visualization in data mining and knowledge discovery},
  publisher = {Morgan Kaufmann},
  address   = {San Francisco},
  year      = {2001},
  keywords  = {Fouille de donnée, Visualisation de l'information},
}
-
B. Shneiderman, "Inventing discovery tools : combining information visualization with data mining," Institute for systems research, 2001.
@techreport{scneiderman_inventing_2001,
  author      = {Shneiderman, Ben},
  title       = {Inventing discovery tools : combining information visualization with data mining},
  institution = {Institute for systems research},
  pages       = {17--28},
  year        = {2001},
  abstract    = {The growing use of information visualization tools and data mining algorithms stems from two separate lines of research. Information visualization researchers believe in the importance of giving users an overview and insight into the data distributions, while data mining researchers believe that statistical algorithms and machine learning can be relied on to find the interesting patterns. This paper discusses two issues that influence design of discovery tools: statistical algorithms vs. visual data presentation, and hypothesis testing vs. exploratory data analysis. I claim that a combined approach could lead to novel discovery tools that preserve user control, enable more effective exploration, and promote responsibility.},
  keywords    = {Fouille de donnée, Visualisation de l'information},
  annote      = {scneidermanBen2001.pdf},
}
-
J. Han and M. Kamber, Data mining : concepts and techniques, San Francisco: Morgan Kaufmann Publishers, 2001.
@book{han_data_2001,
  author    = {Han, Jiawei and Kamber, Micheline},
  title     = {Data mining : concepts and techniques},
  publisher = {Morgan Kaufmann Publishers},
  address   = {San Francisco},
  year      = {2001},
  isbn      = {1558604898},
  keywords  = {Fouille de donnée},
  annote    = {TOC :
    Ch. 1. Introduction. 1.1. What Motivated Data Mining? Why Is It Important? 1.2. So, What Is Data Mining? 1.3. Data Mining - On What Kind of Data? 1.4. Data Mining Functionalities - What Kinds of Patterns Can Be Mined? 1.5. Are All of the Patterns Interesting? 1.6. Classification of Data Mining Systems. 1.7. Major Issues in Data Mining --
    Ch. 2. Data Warehouse and OLAP Technology for Data Mining. 2.1. What Is a Data Warehouse? 2.2. A Multidimensional Data Model. 2.3. Data Warehouse Architecture. 2.4. Data Warehouse Implementation. 2.5. Further Development of Data Cube Technology. 2.6. From Data Warehousing to Data Mining --
    Ch. 3. Data Preprocessing. 3.1. Why Preprocess the Data? 3.2. Data Cleaning. 3.3. Data Integration and Transformation. 3.4. Data Reduction. 3.5. Discretization and Concept Hierarchy Generation --
    Ch. 4. Data Mining Primitives, Languages, and System Architectures. 4.1. Data Mining Primitives: What Defines a Data Mining Task? 4.2. A Data Mining Query Language. 4.3. Designing Graphical User Interfaces Based on a Data Mining Query Language. 4.4. Architectures of Data Mining Systems --
    Ch. 5. Concept Description: Characterization and Comparison. 5.1. What Is Concept Description? 5.2. Data Generalization and Summarization-Based Characterization. 5.3. Analytical Characterization: Analysis of Attribute Relevance. 5.4. Mining Class Comparisons: Discriminating between Different Classes. 5.5. Mining Descriptive Statistical Measures in Large Databases. 5.6. Discussion --
    Ch. 6. Mining Association Rules in Large Databases. 6.1. Association Rule Mining. 6.2. Mining Single-Dimensional Boolean Association Rules from Transactional Databases. 6.3. Mining Multilevel Association Rules from Transaction Databases. 6.4. Mining Multidimensional Association Rules from Relational Databases and Data Warehouses. 6.5. From Association Mining to Correlation Analysis. 6.6. Constraint-Based Association Mining --
    Ch. 7. Classification and Prediction. 7.1. What Is Classification? What Is Prediction? 7.2. Issues Regarding Classification and Prediction. 7.3. Classification by Decision Tree Induction. 7.4. Bayesian Classification. 7.5. Classification by Backpropagation. 7.6. Classification Based on Concepts from Association Rule Mining. 7.7. Other Classification Methods. 7.8. Prediction. 7.9. Classifier Accuracy --
    Ch. 8. Cluster Analysis. 8.1. What Is Cluster Analysis? 8.2. Types of Data in Cluster Analysis. 8.3. A Categorization of Major Clustering Methods. 8.4. Partitioning Methods. 8.5. Hierarchical Methods. 8.6. Density-Based Methods. 8.7. Grid-Based Methods. 8.8. Model-Based Clustering Methods. 8.9. Outlier Analysis --
    Ch. 9. Mining Complex Types of Data. 9.1. Multidimensional Analysis and Descriptive Mining of Complex Data Objects. 9.2. Mining Spatial Databases. 9.3. Mining Multimedia Databases. 9.4. Mining Time-Series and Sequence Data. 9.5. Mining Text Databases. 9.6. Mining the World Wide Web --
    Ch. 10. Applications and Trends in Data Mining. 10.1. Data Mining Applications. 10.2. Data Mining System Products and Research Prototypes. 10.3. Additional Themes on Data Mining. 10.4. Social Impacts of Data Mining. 10.5. Trends in Data Mining.
    App. A. An Introduction to Microsoft's OLE DB for Data Mining --
    App. B. An Introduction to DBMiner.},
}
-
P. C. Wong, "Visual data mining," Computer graphics and applications, IEEE, vol. 19, iss. 5, pp. 20-21, 1999.
@article{wong_visual_1999,
  author   = {Wong, Pak Chung},
  title    = {Visual data mining},
  journal  = {{IEEE} Computer Graphics and Applications},
  volume   = {19},
  number   = {5},
  pages    = {20--21},
  year     = {1999},
  issn     = {0272-1716},
  doi      = {10.1109/MCG.1999.788794},
  url      = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=788794&isnumber=17095},
  keywords = {Fouille de donnée, Visualisation de l'information},
  annote   = {wongPak1999.pdf},
}
-
Two Crows Corporation, Introduction to data mining, 1999.
@misc{two_crows_corporation_introduction_1999,
  author   = {{Two Crows Corporation}},
  title    = {Introduction to data mining},
  year     = {1999},
  url      = {http://www.twocrows.com/intro-dm.pdf},
  abstract = {``Introduction to Data Mining and Knowledge Discovery, Third Edition'' is a valuable educational tool for prospective users. It provides a clear, non-technical overview of the techniques and capabilities of data mining. Available as a PDF file, the contents have been bookmarked for your convenience.},
  keywords = {Fouille de donnée},
  annote   = {tcc1999.pdf},
}
-
P. Perner and M. Petrou, Machine learning and data mining in pattern recognition : first International Workshop, MLDM’99, Leipzig, Germany, September 16-18, 1999 : proceedings, Berlin ; New York: Springer, 1999.
@book{perner_machine_1999,
  editor    = {Perner, Petra and Petrou, Maria},
  title     = {Machine learning and data mining in pattern recognition : first {International} {Workshop}, {MLDM'99}, {Leipzig}, {Germany}, {September} 16--18, 1999 : proceedings},
  publisher = {Springer},
  address   = {Berlin ; New York},
  year      = {1999},
  isbn      = {3540665994},
  keywords  = {Apprentissage machine, Fouille de donnée},
  annote    = {MLDM'99 (1st : 1999 : Leipzig, Germany) Petra Perner, Maria Petrou (eds.). ill. ; 24 cm. Learning in Pattern Recognition / Advances in Predictive Data Mining Methods / Multi-valued and Universal Binary Neurons: Learning Algorithms, Application to Image Processing and Recognition / A Dynamics of the Hough Transform and Artificial Neural Networks / Applications of Cellular Neural Networks for Shape from Shading Problem / Unsupervised Learning of Local Mean Grey Values for Image Pre-Processing / Neural Networks in MR Image Estimation from Sparsely Sampled Scans / Extraction of Local Structural Features in Images by Using a Multi-scale Relevance Function / Independent Feature Analysis for Image Retrieval / Non-hierarchical Clustering with Rival Penalized Competitive Learning for Information Retrieval / Automatic Design of Multiple Classifier Systems by Unsupervised Learning / A Comparison between Neural Networks and Decision Trees / Symbolic Learning Techniques in Paper Document Processing / Recognition of Printed Music Score / Reproductive Process-Oriented Data Mining From Interactions between Human and Complex ArtifactSystem / Generalized Fuzzy Aggregation Operators / A Data Mining Application for Monitoring Environmental Risks /},
}
-
C. A. Barry, Choosing qualitative data analysis software : Atlas/ti and Nudist compared, 1998.
@article{barry_choosing_1998,
  author   = {Barry, Christine A.},
  title    = {Choosing qualitative data analysis software : {Atlas/ti} and {Nudist} compared},
  journal  = {Sociological research online},
  volume   = {3},
  number   = {3},
  year     = {1998},
  url      = {http://www.socresonline.org.uk/3/3/4.html},
  abstract = {Choosing between Nudist and Atlas/ti, the main qualitative data analysis software packages can be difficult. To assist researchers in making this choice, I have conceptualised their differences along two dimensions, related to the qualities of the software and of the research project. The software dimension is structural design, and the project dimension is complexity. Software structure is dichotomised between structured, sequential, verbal versus visual, spatial, interconnected modes of operation. Projects are dichotomised between homogeneous sample, short timeframe, single data-type, single data analyst; versus multiple samples, longitudinal data, multiple data types and team data analysis. First I review the CAQDAS literature. Then I outline the different personalities and strengths of Atlas/ti and Nudist, and show how they match these dimensions. I offer suggestions as to how to choose, and whether to use in tandem with complementary conceptual network software.},
  keywords = {Approche statistique, Fouille de donnée},
}
-
H. Ahonen, O. Heinonen, M. Klemettinen, and A. I. Verkamo, "Applying data mining techniques in text analysis," 1997.
@misc{ahonen_applying_1997,
  author   = {Ahonen, Helena and Heinonen, Oskari and Klemettinen, Mika and Verkamo, A. Inkeri},
  title    = {Applying data mining techniques in text analysis},
  year     = {1997},
  url      = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.52.7756},
  abstract = {A number of recent data mining techniques have been targeted especially for the analysis of sequential data. Traditional examples of sequential data involve telecommunication alarms, WWW log files, user action registration for HCI studies, or any other series of events consisting of an event type and a time of occurrence. Text can also be seen as sequential data, in many respects similar to the data collected by sensors, or other observation systems. Traditionally, texts have been analysed using various information retrieval related methods, such as full-text analysis, and natural language processing. However, only few examples of data mining in text, particularly in full text, are available. In this paper we show that general data mining methods are applicable to text analysis tasks under certain conditions. Moreover, we present a general framework for text mining. The framework follows the general KDD process, thus containing steps from preprocessing to the utilization of the results. The data mining method that we apply is based on generalized episodes and episode rules. We consider preprocessing of the text to be essential in text mining: by shifting the focus in the preprocessing phase, data mining can be used to obtain results for various purposes. We give concrete examples of how to preprocess texts based on the intended use of the discovered results and how to balance preprocessing with postprocessing. We also present example applications including search for key words, key phrases and other co-occurring words, e.g. collocations and generalized concordances. These applications are both common and relevant tasks in information retrieval and natural language processing. We also present results from real-life data experiments to show that our approach is applicable in practice.},
  keywords = {Analyse de texte, Fouille de donnée, Fouille de texte},
  annote   = {ahonenHelena1997.pdf},
}
-
K. Jim, J. Lai, and B. Wüthrich, "A data mining algorithm optimal for single rules," in Deductive and object-oriented databases : 5th international conference, DOOD’97 Montreux, Switzerland, December 8–12, 1997 : proceedings, Berlin; Heidelberg, 1997, pp. 368-385.
@inproceedings{jim_data_1997,
  author    = {Jim, K. and Lai, Jeffrey and Wüthrich, Beat},
  title     = {A data mining algorithm optimal for single rules},
  booktitle = {Deductive and object-oriented databases : 5th international conference, {DOOD'97} Montreux, Switzerland, December 8–12, 1997 : proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {1341},
  pages     = {368--385},
  publisher = {Springer-Verlag},
  address   = {Berlin; Heidelberg},
  year      = {1997},
  isbn      = {3-540-63792-3},
  url       = {http://portal.acm.org/citation.cfm?id=645347.650654&coll=GUIDE&dl=GUIDE},
  abstract  = {Today's rule mining algorithms all use greedy approaches to generate rules representing the knowledge hidden in vast amounts of data. When using a greedy approach, systems cannot guarantee that optimal rules are found. On the other hand, exhaustive search algorithms find optimal rules. But due to the vast search spaces, exhaustive search algorithms are in most cases impractically slow. This paper presents the A*-like rule mining algorithm DA-2. Similarly to exhaustive search algorithms, DA-2 also finds optimal rules. Its running time, however, is just slightly longer than the running time of greedy algorithms.},
  keywords  = {Apprentissage machine, Fouille de donnée},
  annote    = {jimK1997.pdf},
}
-
L. Torgo, Tools for comparative experiments using k-fold cross validation, 1996.
@misc{torgo_tools_1996,
  author   = {Torgo, Luís},
  title    = {Tools for comparative experiments using k-fold cross validation},
  month    = jul,
  year     = {1996},
  url      = {http://www.liaad.up.pt/~ltorgo/Papers/CV_EVAL.ps.gz},
  abstract = {This report describes some tools developed for making experimental evaluation/comparison of different methods on a given data set, using K-fold Cross Validation. These methods can either be different algorithms or different parameterizations of the same system (or both). We provide not only means to easily perform the tests but also tools for analyzing the results. The major goal of the development of these programs was system independence. This goal introduced some constraints on the tasks that could be abstracted as we will see.},
  keywords = {Fouille de donnée, Méthodologie},
  annote   = {torgoLuis1996.pdf},
}
-
S. Wasserman and K. Faust, Social network analysis : methods and applications, New York: Cambridge University Press, 1994.
@book{wasserman_social_1994,
  author    = {Wasserman, Stanley and Faust, Katherine},
  title     = {Social network analysis : methods and applications},
  series    = {Structural analysis in the social sciences},
  publisher = {Cambridge University Press},
  address   = {New York},
  year      = {1994},
  abstract  = {Social network analysis is used widely in the social and behavioral sciences, as well as in economics, marketing, and industrial engineering. The social network perspective focuses on relationships among social entities and is an important addition to standard social and behavioral research, which is primarily concerned with attributes of the social units. Social Network Analysis: Methods and Applications reviews and discusses methods for the analysis of social networks with a focus on applications of these methods to many substantive examples. It is a reference book that can be used by those who want a comprehensive review of network methods, or by researchers who have gathered network data and want to find the most appropriate method by which to analyze it. It is also intended for use as a textbook as it is the first book to provide comprehensive coverage of the methodology and applications of the field.},
  keywords  = {Fouille de donnée},
}
-
J. Scott, Social network analysis : a handbook, Newbury Park, Calif.: Sage, 1991.
@book{scott_social_1991,
  author    = {Scott, John},
  title     = {Social network analysis : a handbook},
  publisher = {Sage},
  address   = {Newbury Park, Calif.},
  year      = {1991},
  keywords  = {Fouille de donnée},
}
-
J.-P. Benzécri, Analyses des données, I : La Taxinomie: Dunod, 1973.
@book{benzcri_analyses_1973,
  author    = {Benzécri, Jean-Paul},
  title     = {Analyses des données. {I} : {La} {Taxinomie}},
  publisher = {Dunod},
  address   = {Paris},
  year      = {1973},
  keywords  = {Fouille de donnée},
}
-
Y. Kodratoff, Machine learning and data mining.
@misc{kodratoff_machine_????,
  author   = {Kodratoff, Yves},
  title    = {Machine learning and data mining},
  abstract = {Deep differences explain why Data Mining has been enthusiastically accepted by Industry, while Machine Learning and Exploratory Statistics still have problems being accepted by it. This paper points at all the epistemological, scientific, and industrial differences between the two, and explains why Data Mining is better accepted in Industry.},
  keywords = {Apprentissage machine, Fouille de donnée},
  annote   = {kodratoffYves.doc},
}
-
L. Billard and E. Diday, Symbolic data analysis : definitions and examples.
@misc{billard_symbolic_????,
  author   = {Billard, L. and Diday, E.},
  title    = {Symbolic data analysis : definitions and examples},
  url      = {http://220.149.236.232/zbxe/?module=file&act=procFileDownload&file_srl=1081&sid=3cb7c3f5b5081d90f992714d8efc75f7},
  keywords = {Fouille de donnée},
  annote   = {billardL.pdf},
}
-
W. Kao, Web logs, text, and other data mining.
@misc{kao_web_????,
  author   = {Kao, Wayne},
  title    = {Web logs, text, and other data mining},
  type     = {{PowerPoint}},
  keywords = {Fouille de donnée, Fouille de texte, Web},
  annote   = {kaoWayne.ppt},
}
-
T.-B. Ho, Introduction to knowledge discovery and data mining.
@misc{ho_introduction_????,
  author   = {Ho, Tu-Bao},
  title    = {Introduction to knowledge discovery and data mining},
  keywords = {Découverte de connaissances, Fouille de donnée},
  annote   = {hoTu-bao.doc},
}
-
K. Thearling, An introduction to data mining.
@misc{thearling_introduction_????,
  author   = {Thearling, Kurt},
  title    = {An introduction to data mining},
  keywords = {Fouille de donnée},
  annote   = {thearlingKurt.pdf},
}
-
S. Parthasarathy, Introduction to data mining.
@misc{parthasarathy_introduction_????,
  author   = {Parthasarathy, Srinivasan},
  title    = {Introduction to data mining},
  type     = {{PowerPoint}},
  keywords = {Fouille de donnée},
  annote   = {parthasarathySrinivasan.ppt},
}
-
A. Srivastava and M. Sahami, Text Mining: Classification, Clustering, and Applications, 1 ed., Chapman \& Hall/CRC, 2009.
@book{srivastava_text_2009,
  author     = {Srivastava, Ashok and Sahami, Mehran},
  title      = {Text Mining: Classification, Clustering, and Applications},
  shorttitle = {Text Mining},
  edition    = {First},
  publisher  = {Chapman \& {Hall/CRC}},
  month      = jun,
  year       = {2009},
  isbn       = {1420059408},
  keywords   = {Classification, Cluster, Fouille de texte},
}
-
J. Froelich and S. Ananyan, "Decision support via text mining," in Handbook on decision support systems 1, Berlin; Heidelberg, 2008, pp. 609-635.
@incollection{froelich_decision_2008,
  author    = {Froelich, Josh and Ananyan, Sergei},
  title     = {Decision support via text mining},
  booktitle = {Handbook on decision support systems 1},
  series    = {International handbook on information systems},
  pages     = {609--635},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2008},
  doi       = {10.1007/978-3-540-48713-5_28},
  url       = {http://dx.doi.org/10.1007/978-3-540-48713-5_28},
  abstract  = {The growing volume of textual data presents genuine, modern day challenges that traditional decision support systems, focused on quantitative data processing, are unable to address. The costs of competitive intelligence, customer experience metrics, and manufacturing controls are escalating as organizations are buried in piles of open-ended responses, news articles and documents. The emerging field of text mining is capable of transforming natural language into actionable results, acquiring new insight and managing information overload.},
  keywords  = {Fouille de texte},
}
-
J. Diesner and K. Carley, "Conditional random fields for entity extraction and ontological text coding," Computational \& Mathematical Organization Theory, 2008.
@article{diesner_conditional_2008,
  author   = {Diesner, Jana and Carley, Kathleen},
  title    = {Conditional random fields for entity extraction and ontological text coding},
  journal  = {Computational \& Mathematical Organization Theory},
  year     = {2008},
  doi      = {10.1007/s10588-008-9029-z},
  url      = {http://dx.doi.org/10.1007/s10588-008-9029-z},
  abstract = {Previous research suggests that one field with a strong yet unsatisfied need for automatically extracting instances of various entity classes from texts is the analysis of socio-technical systems (Feldstein in Media in Transition MiT5, 2007; Hampe et al. in Netzwerkanalyse und Netzwerktheorie, 2007; Weil et al. in Proceedings of the 2006 Command and Control Research and Technology Symposium, 2006; Diesner and Carley in XXV Sunbelt Social Network Conference, 2005). Traditional as well as non-traditional and customized sets of entity classes and the relationships between them are often specified in ontologies or taxonomies. We present a Conditional Random Fields (CRF)-based approach to distilling a set of entities that are defined in an ontology originating from organization science. CRF, a supervised sequential machine learning technique, facilitates the derivation of relational data from corpora by locating and classifying instances of various entity classes. The classified entities can be used as nodes for the construction of socio-technical networks. We find the outcome sufficiently accurate (82.7 percent accuracy of locating and classifying entities) for future application in the described problem domain. We propose using the presented methodology as a crucial step in the process of advanced modeling and analysis of complex and dynamic networks.},
  keywords = {Fouille de texte, Ontologie},
}
-
Tsang-Hsiang and Chih-Ping, "A clustering-based approach for integrating document-category hierarchies," IEEE Transactions on systems, man \& cybernetics: part A, vol. 38, iss. 2, pp. 410-424, 2008.
@article{tsang-hsiang_cheng_clustering-based_2008,
  author   = {Cheng, Tsang-Hsiang and Wei, Chih-Ping},
  title    = {A clustering-based approach for integrating document-category hierarchies},
  journal  = {{IEEE} Transactions on Systems, Man \& Cybernetics: Part A},
  volume   = {38},
  number   = {2},
  pages    = {410--424},
  month    = mar,
  year     = {2008},
  issn     = {1083-4427},
  doi      = {10.1109/TSMCA.2007.914758},
  url      = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4459764},
  keywords = {Fouille de texte},
  abstract = {E-commerce applications generate and consume a tremendous amount of online information, which is typically available as textual documents. Conceivably, organizations and individuals generally use category sets or hierarchies to organize, archive, and access their documents. Meanwhile, organizations and individuals constantly acquire relevant documents from various Internet sources, each of which may organize its documents in a category set or hierarchy different from that used by the acquiring organization or individual. Consequently, the integration of source documents organized in a category hierarchy into an existing category hierarchy deployed by the acquiring organization or individual becomes an important issue in the e-commerce era. Existing category-integration techniques are mainly designed to integrate document catalogs, each of which is organized nonhierarchically (i.e., in a flat set). In this paper, we propose a clustering-based category-hierarchy integration {(CHI)} technique, which is an extension of the clustering-based category-integration {(CCI)} technique. Our empirical evaluation results show that the proposed {CHI} technique appears to improve the effectiveness of category-hierarchy integration compared with that attained by nonhierarchical category-integration techniques, particularly in homogeneous and comparable scenarios.},
  annote   = {Accession Number: 31205346; {Tsang-Hsiang} Cheng 1; Email Address: cts@mail.stut.edu.tw; {Chih-Ping} Wei 2; Email Address: cpwei@mx.nthu.edu.tw; Affiliations: 1: Department of Business Administration, Southern Taiwan University, Tainan 710, Taiwan, {R.O.C;} 2: Institute of Technology Management, College of Technology Management, National Tsing Hua University, Hsinchu 300, Taiwan, {R.O.C;} Issue Info: Mar2008, Vol. 38 Issue 2, p410; Thesaurus Term: {CLUSTER} analysis; Thesaurus Term: {ELECTRONIC} commerce; Thesaurus Term: {ONLINE} information services; Thesaurus Term: {INTERNET;} Thesaurus Term: {DOCUMENT} management; Subject Term: {DOCUMENT} clustering; Subject Term: {HIERARCHIES;} Subject Term: {CATALOGS;} Subject Term: {TEXT} mining {(Information} retrieval); {Author-Supplied} Keyword: Category-hierarchy integration; {Author-Supplied} Keyword: document clustering; {Author-Supplied} Keyword: document management; {Author-Supplied} Keyword: document-category integration; {Author-Supplied} Keyword: taxonomy integration; {Author-Supplied} Keyword: text mining; {NAICS/Industry} Codes: 518111 Internet Service Providers; {NAICS/Industry} Codes: 425110 Business to Business Electronic Markets; {NAICS/Industry} Codes: 454111 Electronic Shopping; Number of Pages: 15p; Illustrations: 2 charts, 11 diagrams, 2 graphs; Document Type: Article},
}
-
B. Yoon, R. Phaal, and D. Probert, "Morphology analysis for technology roadmapping: application of text mining," R\&D Management, vol. 38, iss. 1, pp. 51-68, 2008.
@article{byungun_yoon_morphology_2008,
  author     = {Yoon, Byungun and Phaal, Rob and Probert, David},
  title      = {Morphology analysis for technology roadmapping: application of text mining},
  shorttitle = {Morphology analysis for technology roadmapping},
  journal    = {{R\&D} Management},
  volume     = {38},
  number     = {1},
  pages      = {51--68},
  year       = {2008},
  issn       = {0033-6807},
  doi        = {10.1111/j.1467-9310.2007.00493.x},
  keywords   = {Fouille de texte},
  abstract   = {The practice of technology roadmapping {(TRM)} has received much attention from researchers and practitioners, to support planning and forecasting in companies and sectors. However, little research has focused on the support of well-organized information for more effective roadmapping and the presentation of in-depth configurations of new products or technology. This paper proposes a roadmapping methodology to assist decision-making by applying a systematic approach based on quantitative data. To this end, key information is extracted from documents such as product manuals and patent documents by text mining, which is then used to identify the morphology of existing products and technology. Morphology analysis {(MA)} also plays a crucial role in deriving promising opportunities for new development of product or technology by matching product and technology morphology. Therefore, {MA-based} {TRM} can enable the effective exploitation of large quantities of significant information that might otherwise be left untapped, supporting innovation by generating a comprehensive set of detailed product and technology configurations. The proposed {MA-based} {TRM} approach can be applied to both incremental and radical innovation, supporting both market pull and technology push. The method is illustrated with a detailed example for mobile phones to demonstrate its practical application.},
  annote     = {Accession Number: 27974305; Byungun Yoon 1; Phaal, Rob 2; Email Address: rp108@eng.cam.ac.uk; Probert, David 2; Email Address: drp1001@cam.ac.uk; Affiliations: 1: Department of Industrial \& Systems Engineering, School of Engineering, Dongguk University, 3-26, Pil-dong 3ga, Chung-gu, Seoul, Korea; 2: Centre for Technology Management, Mill Lane, University of Cambridge, Cambridge, {UK;} Issue Info: Jan2008, Vol. 38 Issue 1, p51; Thesaurus Term: {NEW} products; Thesaurus Term: {QUALITY} of products; Thesaurus Term: {DECISION} making; Thesaurus Term: {TECHNOLOGY;} Thesaurus Term: {BUSINESS} forecasting; Thesaurus Term: {INFORMATION} resources; Thesaurus Term: {CELLULAR} telephones; Subject Term: {TEXT} mining {(Information} retrieval); {NAICS/Industry} Codes: 517212 Cellular and Other Wireless Telecommunications; Number of Pages: 18p; Document Type: Article},
}
-
D. Thorleuchter, "Finding new technological ideas and inventions with text mining and technique philosophy," in Data analysis, machine learning and applications : proceedings of the 31st annual conference of the Gesellschaft für Klassifikation e.V., Albert-Ludwigs-Universität Freiburg, March 7–9, 2007, 2008, pp. 413-420.
@inproceedings{thorleuchter_finding_2008,
  author    = {Thorleuchter, Dirk},
  title     = {Finding new technological ideas and inventions with text mining and technique philosophy},
  booktitle = {Data analysis, machine learning and applications : proceedings of the 31st annual conference of the Gesellschaft für Klassifikation {e.V.,} {Albert-Ludwigs-Universität} Freiburg, March 7–9, 2007},
  series    = {Studies in classification, data analysis, and knowledge organization},
  publisher = {Springer},
  pages     = {413--420},
  year      = {2008},
  doi       = {10.1007/978-3-540-78246-9_49},
  url       = {http://dx.doi.org/10.1007/978-3-540-78246-9_49},
  keywords  = {Fouille de texte, Philosophie},
  abstract  = {Text mining refers generally to the process of deriving high quality information from unstructured texts. Unstructured texts come in many shapes and sizes. It may be stored in research papers, articles in technical periodicals, reports, documents, web pages etc. Here we introduce a new approach for finding textual patterns representing new technological ideas and inventions in unstructured technological texts. This text mining approach follows the statements of technique philosophy. Therefore a technological idea or invention represents not only a new mean, but a new purpose and mean combination. By systematic identification of the purposes, means and purpose-mean combinations in unstructured technological texts compared to specialized reference collections, a (semi-) automatic finding of ideas and inventions can be realized. Characteristics that are used to measure the quality of these patterns found in technological texts are comprehensibility and novelty to humans and usefulness for an application.},
}
-
G. Lappas, "An overview of web mining in societal benefit areas," Online Information Review, vol. 32, iss. 2, pp. 179-195, 2008.
@article{georgios_lappas_overview_2008,
  author   = {Lappas, Georgios},
  title    = {An overview of web mining in societal benefit areas},
  journal  = {Online Information Review},
  volume   = {32},
  number   = {2},
  pages    = {179--195},
  year     = {2008},
  issn     = {1468-4527},
  doi      = {10.1108/14684520810879818},
  url      = {http://www.emeraldinsight.com/10.1108/14684520810879818},
  keywords = {Fouille de texte, Web},
  abstract = {Purpose – The focus of this paper is a survey of web-mining research related to areas of societal benefit. The article aims to focus particularly on web mining which may benefit societal areas by extracting new knowledge, providing support for decision making and empowering the effective management of societal issues. Design/methodology/approach – E-commerce and e-business are two fields that have been empowered by web mining, having many applications for increasing online sales and doing intelligent business. Have areas of social interest also been empowered by web mining applications? What are the current ongoing research and trends in e-services fields such as e-learning, e-government, e-politics and e-democracy? What other areas of social interest can benefit from web mining? This work will try to provide the answers by reviewing the literature for the applications and methods applied to the above fields. Findings – There is a growing interest in applications of web mining that are of social interest. This reveals that one of the current trends of web mining is toward the connection between intelligent web services and societal benefit applications, which denotes the need for interdisciplinary collaboration between researchers from various fields. Originality/value – On the one hand, this work presents to the web-mining community an overview of research opportunities in societal benefit areas. On the other hand, it presents to web researchers from various disciplines an approach for improving their web studies by considering web mining as a powerful research tool.},
}
-
F. Lin and C. Liang, "Storyline-based summarization for news topic retrospection," Decision Support Systems, vol. 45, iss. 3, pp. 473-490, 2008.
@article{lin_storyline-based_2008,
  author   = {Lin, Fu-ren and Liang, Chia-Hao},
  title    = {Storyline-based summarization for news topic retrospection},
  journal  = {Decision Support Systems},
  volume   = {45},
  number   = {3},
  pages    = {473--490},
  year     = {2008},
  issn     = {0167-9236},
  doi      = {10.1016/j.dss.2007.06.009},
  keywords = {Fouille de texte, Recherche d'information},
  abstract = {Electronics newspapers gradually become main sources for news readers. When facing the numerous reports on a series of events in a topic, a summary of stories from news reports will benefit news readers in reviewing the news topic efficiently. Besides identifying events and presenting news titles and keywords the {TDT} {(Topic} Detection and Tracking) techniques are used to do, a summarized text to present event evolution is necessary for general news readers to review events under a news topic. This paper proposes a topic retrospection process and implements the {SToRe} {(Story-line} based Topic Retrospection) system that identifies various events under a news topic, and composes a summary that news readers can get the sketch of event evolution in the topic. It consists of three main functions: event identification, main storyline construction and storyline-based summarization. The constructed main storyline can remove the irrelevant events and present a main theme. The storyline-based summarization extracts the representative sentences and takes the main theme as the template to compose the summary. The storyline summary not only provides readers enough information to understand the development of a news topic, but also serves as an index for readers to search corresponding news reports. Following a design science paradigm, a lab experiment is conducted to evaluate the {SToRe} system in the question-and-answer {(Q\&A)} setting. The experimental results show that {SToRe} enables news readers to effectively and efficiently capture the evolution of a news topic.},
  annote   = {Accession Number: 32638566; Lin, Fu-ren 1; Email Address: frlin@mx.nthu.edu.tw; Liang, {Chia-Hao} 2; Affiliations: 1: Institute of Technology Management National Tsing Hua University, Hsinchu City, Taiwan 300, {ROC;} 2: Department of Information Management National Sun Yat-sen University, Kaohsiung City, Taiwan 804, {ROC;} Issue Info: Jun2008, Vol. 45 Issue 3, p473; Thesaurus Term: {INFORMATION} retrieval; Thesaurus Term: {COMPUTER} programs; Thesaurus Term: {INFORMATION} services; Subject Term: {ELECTRONIC} newspapers; Subject Term: {INFORMATION} filtering systems; Subject Term: {TEXT} mining {(Information} retrieval); Subject Term: {NEWS} audiences; Subject Term: {COMMUNICATION} \& technology; {NAICS/Industry} Codes: 519190 All Other Information Services; Number of Pages: 18p; Document Type: Article},
}
-
G. G. Yen and Z. Wu, "Ranked centroid projection : a data visualization approach with self-organizing maps," IEEE Transactions on Neural Networks, vol. 19, iss. 2, pp. 245-259, 2008.
@article{yen_ranked_2008,
  author     = {Yen, Gary G. and Wu, Zheng},
  title      = {Ranked centroid projection : a data visualization approach with self-organizing maps},
  shorttitle = {Ranked Centroid Projection},
  journal    = {{IEEE} Transactions on Neural Networks},
  volume     = {19},
  number     = {2},
  pages      = {245--259},
  month      = feb,
  year       = {2008},
  issn       = {1045-9227},
  doi        = {10.1109/TNN.2007.905858},
  keywords   = {Fouille de donnée, Fouille de texte, Réseau de neurones, Visualisation de l'information},
  abstract   = {The self-organizing map {(SOM)} is an efficient tool for visualizing high-dimensional data. In this paper, the clustering and visualization capabilities of the {SOM,} especially in the analysis of textual data, i.e., document collections, are reviewed and further developed. A novel clustering and visualization approach based on the {SOM} is proposed for the task of text mining. The proposed approach first transforms the document space into a multidimensional vector space by means of document encoding. Afterwards, a growing hierarchical {SOM} {(GHSOM)} is trained and used as a baseline structure to automatically produce maps with various levels of detail. Following the {GHSOM} training, the new projection method, namely the ranked centroid projection {(RCP),} is applied to project the input vectors to a hierarchy of {2-D} output maps. The {RCP} is used as a data analysis tool as well as a direct interface to the data. In a set of simulations, the proposed approach is applied to an illustrative data set and two real-world scientific document collections to demonstrate its applicability.},
  annote     = {Accession Number: 31171851; Yen, Gary G. 1; Email Address: gyen@okstate.edu; Zheng Wu 1; Affiliations: 1: School of Electrical and Computer Engineering, Oklahoma State University, Stillwater, {OK} 74078 {USA;} Issue Info: Feb2008, Vol. 19 Issue 2, p245; Thesaurus Term: {NEURAL} networks {(Computer} science); Thesaurus Term: {VISUAL} programming languages {(Computer} science); Thesaurus Term: {DATA} mining; Subject Term: {SELF-organizing} maps; Subject Term: {TEXT} mining {(Information} retrieval); Subject Term: {CONTENT} mining; Subject Term: {SELF-organizing} systems; Subject Term: {VECTOR} analysis; Subject Term: {ENCODING;} {Author-Supplied} Keyword: Data visualization; {Author-Supplied} Keyword: document clustering; {Author-Supplied} Keyword: self-organizing map {(SOM);} Number of Pages: 15p; Illustrations: 3 charts, 6 diagrams, 15 graphs, 2 bw; Document Type: Article},
}
-
L. Arco, R. Bello, Y. Caballero, and R. Falcón, "Rough text assisting text mining : focus on document clustering validity," in Granular computing : at the junction of rough sets and fuzzy sets, Berlin; Heidelberg: Springer, 2008, pp. 229-248.
@incollection{arco_rough_2008,
  author    = {Arco, Leticia and Bello, Rafael and Caballero, Yailé and Falcón, Rafael},
  title     = {Rough text assisting text mining : focus on document clustering validity},
  booktitle = {Granular computing : at the junction of rough sets and fuzzy sets},
  series    = {Studies in fuzziness and soft computing},
  volume    = {224},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  pages     = {229--248},
  year      = {2008},
  isbn      = {978-3-540-76972-9},
  doi       = {10.1007/978-3-540-76973-6_15},
  url       = {http://dx.doi.org/10.1007/978-3-540-76973-6_15},
  keywords  = {Cluster, Fouille de texte},
  abstract  = {In this chapter, the applications of rough set theory {(RST)} in text mining are discussed and a new concept named {“Rough} Text” is presented along with some {RST-based} measures for the evaluation of decision systems. We will focus on the application of such concept in clustering validity, specifically cluster labeling and multi-document summarization. The experimental studies show that the proposed measures outperform several internal measures existing on literature. Additionally, the application of Rough Text is illustrated.},
}
-
A. Gonsalves, "Clarabridge improves text-analytics platform," Intelligent Enterprise, vol. 11, iss. 6, p. 6, 2008.
@article{gonsalves_clarabridge_2008,
  author   = {Gonsalves, Antone},
  title    = {Clarabridge improves text-analytics platform},
  journal  = {Intelligent Enterprise},
  volume   = {11},
  number   = {6},
  pages    = {6},
  year     = {2008},
  issn     = {1524-3621},
  keywords = {Catalogage, Fouille de donnée, Fouille de texte, Recherche d'information},
  abstract = {The article reports on the upgrade of Clarabridge's text-analytics platform Content Mining Platform. It states that Release 3.0 of the Content Mining Platform includes Navigator, a drag-and-drop interface and its extraction engine has finer domain-specific tuning. It also provides the information of the enhancements done as well as the technology used in the upgrade of Release 3.0. Furthermore, it stresses that this version of the platform also offers collaboration tools for business analysts in different departments. An overview of the Gaylord Hotels, which is one of the customers of Clarabridge, is also presented.},
  annote   = {Accession Number: 33125974; Gonsalves, Antone; Issue Info: Jun2008, Vol. 11 Issue 6, p6; Thesaurus Term: {DATA} mining; Thesaurus Term: {DATABASE} searching; Thesaurus Term: {ONLINE} data processing; Thesaurus Term: {INFORMATION} retrieval; Thesaurus Term: {COMPUTER} programs; Subject Term: {SOFTWARE;} Subject Term: {CATALOGING} -- Analytical entry; Subject Term: {CONTENT} mining; Subject Term: {TEXT} mining {(Information} retrieval); Subject Term: {AUTOMATIC} extracting {(Information} science) ; {Company/Entity:} {CLARABRIDGE} {(Company);} Number of Pages: 1p; Document Type: Article},
}
-
C. Wei, C. C. Yang, and C. Lin, "A Latent Semantic Indexing-based approach to multilingual document clustering," Decision Support Systems, vol. 45, iss. 3, pp. 606-620, 2008.
@article{wei_latent_2008,
  author        = {Wei, Chih-Ping and Yang, Christopher C. and Lin, Chia-Min},
  title         = {A {Latent} {Semantic} {Indexing-based} approach to multilingual document clustering},
  journal       = {Decision Support Systems},
  volume        = {45},
  number        = {3},
  pages         = {606--620},
  year          = {2008},
  issn          = {0167-9236},
  doi           = {10.1016/j.dss.2007.07.008},
  keywords      = {Cluster, Fouille de texte, Indexation},
  abstract      = {The creation and deployment of knowledge repositories for managing, sharing, and reusing tacit knowledge within an organization has emerged as a prevalent approach in current knowledge management practices. A knowledge repository typically contains vast amounts of formal knowledge elements, which generally are available as documents. To facilitate users' navigation of documents within a knowledge repository, knowledge maps, often created by document clustering techniques, represent an appealing and promising approach. Various document clustering techniques have been proposed in the literature, but most deal with monolingual documents (i.e., written in the same language). However, as a result of increased globalization and advances in Internet technology, an organization often maintains documents in different languages in its knowledge repositories, which necessitates multilingual document clustering {(MLDC)} to create organizational knowledge maps. Motivated by the significance of this demand, this study designs a Latent Semantic Indexing {(LSI)-based} {MLDC} technique capable of generating knowledge maps (i.e., document clusters) from multilingual documents. The empirical evaluation results show that the proposed {LSI-based} {MLDC} technique achieves satisfactory clustering effectiveness, measured by both cluster recall and cluster precision, and is capable of maintaining a good balance between monolingual and cross-lingual clustering effectiveness when clustering a multilingual document corpus.},
  annote        = {Accession Number: 32638577; Wei, {Chih-Ping} 1; Email Address: cpwei@mx.nthu.edu.tw; Yang, Christopher C. 2; Email Address: yang@se.cuhk.edu.hk; Lin, {Chia-Min} 3; Email Address: alucard.lin@gmail.com; Affiliations: 1: Institute of Technology Management, College of Technology Management, National Tsing Hua University, Hsinchu, Taiwan, {ROC;} 2: Department of Systems Engineering and Engineering Management, The Chinese University of Hong Kong, Shatin, {N.T.,} Hong Kong; 3: Openfind Information Technology Inc., {4F,} No. 222, Sec. 2, {Nan-Chang} Rd., Taipei, Taiwan, {ROC;} Issue Info: Jun2008, Vol. 45 Issue 3, p606; Thesaurus Term: {KNOWLEDGE} management; Thesaurus Term: {INFORMATION} services; Thesaurus Term: {MANAGEMENT;} Thesaurus Term: {INFORMATION} resources management; Subject Term: {DOCUMENT} clustering; Subject Term: {TEXT} mining {(Information} retrieval); Subject Term: {CROSS-language} information retrieval; Subject Term: {COMPUTER} network resources; Subject Term: {INFORMATION} sharing; Subject Term: {AUTOMATIC} indexing; Number of Pages: 15p; Document Type: Article},
  internal-note = {NOTE(review): the original export carried a second annote field containing only "osswaldRainer2003.pdf"; the file name does not match this entry -- verify the attachment},
}
-
C. Chou, A. Sinha, and H. Zhao, "A text mining approach to Internet abuse detection," Information Systems and E-Business Management, 2008.
@article{chou_text_2008,
  author   = {Chou, Chen-Huei and Sinha, Atish and Zhao, Huimin},
  title    = {A text mining approach to {Internet} abuse detection},
  journal  = {Information Systems and {E-Business} Management},
  year     = {2008},
  doi      = {10.1007/s10257-007-0070-0},
  url      = {http://dx.doi.org/10.1007/s10257-007-0070-0},
  keywords = {Fouille de texte},
  abstract = {As the use of the Internet in organizations continues to grow, so does Internet abuse in the workplace. Internet abuse activities by employees-such as online chatting, gaming, investing, shopping, illegal downloading, pornography, and cybersex-and online crimes are inflicting severe costs to organizations in terms of productivity losses, resource wasting, security risks, and legal liabilities. Organizations have started to fight back via Internet usage policies, management training, and monitoring. Internet filtering software products are finding an increasing number of adoptions in organizations. These products mainly rely on blacklists, whitelists, and keyword/profile matching. In this paper, we propose a text mining approach to Internet abuse detection. We have empirically compared a variety of term weighting, feature selection, and classification techniques for Internet abuse detection in the workplace of software programmers. The experimental results are very promising; they demonstrate that the proposed approach would effectively complement the existing Internet filtering techniques.},
}
-
J. A. Ekstrom and G. T. Lau, "Exploratory text mining of ocean law to measure overlapping agency and jurisdictional authority," in Proceedings of the 2008 International Conference on Digital Government Research, Montreal, Canada, 2008, pp. 53-62.
@inproceedings{ekstrom_exploratory_2008,
  author    = {Ekstrom, Julia A. and Lau, Gloria T.},
  title     = {Exploratory text mining of ocean law to measure overlapping agency and jurisdictional authority},
  booktitle = {Proceedings of the 2008 International Conference on Digital Government Research},
  publisher = {Digital Government Society of North America},
  address   = {Montreal, Canada},
  pages     = {53--62},
  year      = {2008},
  isbn      = {978-1-60558-099-9},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1367844&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  keywords  = {Fouille de texte},
  abstract  = {In this paper, we describe an innovative application of mining laws to identify and measure overlapping jurisdictions of government agencies. Laws (statutes and regulations) were used to represent ocean and coastal management for four geopolitical jurisdictions of federal and state levels {(Washington,} Oregon, and California). We developed preliminary metrics of overlap based on the number of statutes, regulations, and agencies associated with any given topic. The utility of these metrics was tested on 46 topics representing a range of activities and resources across ocean-related sectors within the geographic scope of laws investigated. We found the preliminary results of the overlaps metrics to reveal results similar to a recent review of federal ocean management. In addition, a network diagram graphical display of the data revealed multiple dimensions to facilitate interpretation of results.},
  annote    = {ekstromJulia2008.pdf},
}
-
Y. Li, C. Luo, and S. M. Chung, "Text clustering with feature selection by using statistical data," IEEE Transactions on Knowledge and Data Engineering, vol. 20, iss. 5, pp. 641-652, 2008.
@article{yanjun_li_text_2008,
  author        = {Li, Yanjun and Luo, Congnan and Chung, S. M.},
  title         = {Text clustering with feature selection by using statistical data},
  journal       = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume        = {20},
  number        = {5},
  pages         = {641--652},
  year          = {2008},
  issn          = {1041-4347},
  doi           = {10.1109/TKDE.2004.1269663},
  url           = {http://ieeexplore.ieee.org/Xplore/login.jsp?url=/iel5/69/28407/01269663.pdf?tp=&isnumber=28407&arnumber=1269663&punumber=%3Cb%3E%3Cfont%20color=990000%3E69%3C/font%3E%3C/b%3E},
  keywords      = {Approche statistique, Cluster, Fouille de texte},
  abstract      = {Feature selection is an important method for improving the efficiency and accuracy of text categorization algorithms by removing redundant and irrelevant terms from the corpus. In this paper, we propose a new supervised feature selection method, named {CHIR,} which is based on the chi2 statistic and new statistical data that can measure the positive term-category dependency. We also propose a new text clustering algorithm, named text clustering with feature selection {(TCFS).} {TCFS} can incorporate {CHIR} to identify relevant features (i.e., terms) iteratively, and the clustering becomes a learning process. We compared {TCFS} and the K-means clustering algorithm in combination with different feature selection methods for various real data sets. Our experimental results show that {TCFS} with {CHIR} has better clustering accuracy in terms of the F-measure and the purity.},
  annote        = {liYanjun2008.pdf},
  internal-note = {NOTE(review): the doi and url carry a 2004 article number although this entry is dated 2008 (vol. 20, no. 5); verify both against the publisher record},
}
-
D. Delen and M. D. Crossland, "Seeding the survey and analysis of research literature with text mining," Expert Systems with Applications, vol. 34, iss. 3, pp. 1707-1720, 2008.
@article{delen_seedingsurvey_2008,
  author   = {Delen, D. and Crossland, M. D.},
  title    = {Seeding the survey and analysis of research literature with text mining},
  journal  = {Expert Systems with Applications},
  volume   = {34},
  number   = {3},
  pages    = {1707--1720},
  month    = apr,
  year     = {2008},
  issn     = {0957-4174},
  url      = {http://apps.isiknowledge.com/full_record.do?product=WOS&search_mode=CitationReport&qid=4&SID=1EOnkFI4kknf@49oNkC&page=1&doc=24},
  keywords = {Catégorisation, Classification, Cluster, Extraction d'information, Fouille de texte},
  abstract = {Text mining is a semi-automated process of extracting knowledge from a large amount of unstructured data. Given that the amount of unstructured data being generated and stored is increasing rapidly, the need for automated means to process it is also increasing. In this study, we present, discuss and evaluate the techniques used to perform text mining on collections of textual information. A case study is presented using text mining to identify clusters and trends of related research topics from three major journals in the management information systems field. Based on the findings of this case study, it is proposed that this type of analysis could potentially be valuable for researchers in any field. {(C)} 2007 Elsevier Ltd. All rights reserved.},
}
-
Y. Ku, C. Chiu, B. Liou, J. Liou, and J. Wu, "Applying text mining to assist people who inquire HIV/AIDS information from Internet," in Intelligence and security informatics : IEEE ISI 2008 international workshops: PAISI, PACCF, and SOCO 2008, Taipei, Taiwan, June 17, 2008 : proceedings, Berlin; Heidelberg, 2008, pp. 440-448.
% Conference paper in the LNCS series. The series number lives in `volume`
% and the bare DOI (taken from the resolver URL) in `doi`.
@inproceedings{ku_applying_2008,
  author    = {Ku, Yungchang and Chiu, Chaochang and Liou, Bo-Hong and Liou, Jyun-Hong and Wu, Jheng-Ying},
  title     = {Applying text mining to assist people who inquire {HIV/AIDS} information from Internet},
  booktitle = {Intelligence and security informatics : {IEEE} {ISI} 2008 international workshops: {PAISI}, {PACCF}, and {SOCO} 2008, Taipei, Taiwan, June 17, 2008 : proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {5075},
  pages     = {440--448},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2008},
  doi       = {10.1007/978-3-540-69304-8_46},
  url       = {http://dx.doi.org/10.1007/978-3-540-69304-8_46},
  keywords  = {Fouille de texte},
  abstract  = {Inquire health information from Internet or virtual community is one of hot activities on the web. But no one can guarantee the treatments or remedy work or not for the health questioners. The present research proposes an Internet health information governance mechanism {(IHIGM)} for support the diseases control and health authority to do their efforts in Internet health information. In the experiment, the research takes {“People} Inquire {HIV/AIDS} Information from Internet” as example and explains the procedure of {IHIGM.} In the experiment result, the accuracy ratio of {IHIGM} can at least classify 85\% positive {HIV/AIDS} Internet health information inquiry for intervention.}
}
-
D. Henschen, "SPSS upgrades take predictive modeling and text mining mainstream," Intelligent Enterprise, vol. 11, iss. 1, p. 1, 2008.
% Magazine article. The export had the document type "Article" in the doi
% field; that is not a DOI, so no doi is stored (the document type is still
% recorded in the annote).
@article{henschen_spss_2008,
  author   = {Henschen, Doug},
  title    = {{SPSS} upgrades take predictive modeling and text mining mainstream},
  journal  = {Intelligent Enterprise},
  volume   = {11},
  number   = {1},
  pages    = {1},
  year     = {2008},
  issn     = {1524-3621},
  keywords = {Business intelligence, Fouille de texte, Recherche d'information},
  abstract = {The article reports on the launching of the Clementine 12.0 and Text Mining for Clementine 12.0 by {SPSS} Inc. The former extends automated modeling capabilities and the latter incorporates improved data visualization capabilities via a Graph Board. According to Richard Hren, {SPSS} director of product marketing, the firm has seen significant acceptance of its integration of text mining within its predictive analytics portfolio. The product is considered to take advantage of multithreading, multicore processors and load balancing.},
  annote   = {{{\textless}p{\textgreater}Accession} Number: 29826075; Henschen, Doug; Issue Info: Jan2008, Vol. 11 Issue 1, p1; Thesaurus Term: {VISUAL} programming languages {(Computer} science); Thesaurus Term: {BUSINESS} intelligence; Thesaurus Term: {ONLINE} information services; Thesaurus Term: {APPLICATION} software; Thesaurus Term: {INFORMATION} technology; Thesaurus Term: {DATABASE} management; Thesaurus Term: {INFORMATION} retrieval; Subject Term: {DEVELOPMENT;} Subject Term: {ENTERPRISE} application integration {(Computer} systems); Subject Term: {TEXT} mining {(Information} retrieval) ; {Company/Entity:} {SPSS} Inc. {DUNS} Number: 030880488 Ticker: {SPSS;} Number of Pages: 1p; Document Type: Article{\textless}/p{\textgreater}}
}
-
H. Kim and J. Y. Lee, "Exploring the emerging intellectual structure of archival studies using text mining : 2001-2004," Journal of Information Science, vol. 34, iss. 3, pp. 356-369, 2008.
% Journal article. The export ran the second author's initials together ("JY");
% names are stored in "Last, First" form with the initials separated.
@article{kim_exploringemerging_2008,
  author     = {Kim, H. and Lee, J. Y.},
  title      = {Exploring the emerging intellectual structure of archival studies using text mining : 2001-2004},
  shorttitle = {Exploring the emerging intellectual structure of archival studies using text mining},
  journal    = {Journal of Information Science},
  volume     = {34},
  number     = {3},
  pages      = {356--369},
  month      = jun,
  year       = {2008},
  issn       = {0165-5515},
  keywords   = {Cluster, Fouille de texte},
  abstract   = {Archival science, like other disciplines, is evolving into more specific interdisciplinary subfields. To determine this intellectual structure of archival science, the text mining method was used. The data were 432 articles from 2001 to 2004, and we produced 43 clusters of documents using the within-group average method in {SPSS.} Then we generated pathfinder networks of 43 clusters and grouped them into seven subject categories: digital libraries and digital archiving technologies, online resources and finding aids, archives and archivists, legal and political issues, electronic records and technical issues, records and information management, and e-mail and information professionals. Finally, these seven subject categories were merged into three sectors: digital library, archives and {RIM} {(Business).} This study describes dynamic change in the 2001-4 research themes from traditional single-subject areas to emerging, complex subject areas. These results also show that research areas in archival sciences have much growth potential and will continue to expand.}
}
-
C. Köse, Ö. Özyurt, and C. İkibaş, "A comparison of textual data mining methods for sex identification in Chat conversations," in Information retrieval technology : 4th Asia information retrieval symposium, AIRS 2008, Harbin, China, january 15-18, 2008 : revised selected papers, Berlin; Heidelberg, 2008, pp. 638-643.
% Conference paper in the LNCS series. Accented letters in the author names
% are written as braced LaTeX accents so that classic BibTeX sorts and
% abbreviates them correctly. The series number lives in `volume`, and the
% bare DOI (taken from the resolver URL) in `doi`.
@inproceedings{kse_comparison_2008,
  author    = {K{\"o}se, Cemal and {\"O}zyurt, {\"O}zcan and {\.I}kiba{\c{s}}, Cevat},
  title     = {A comparison of textual data mining methods for sex identification in Chat conversations},
  booktitle = {Information retrieval technology : 4th Asia information retrieval symposium, {AIRS} 2008, Harbin, China, January 15-18, 2008 : revised selected papers},
  series    = {Lecture Notes in Computer Science},
  volume    = {4993},
  pages     = {638--643},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2008},
  doi       = {10.1007/978-3-540-68636-1_76},
  url       = {http://dx.doi.org/10.1007/978-3-540-68636-1_76},
  keywords  = {Apprentissage machine, Fouille de texte, Recherche d'information},
  abstract  = {Mining textual data in chat mediums is becoming more important because these mediums contain a vast amount of information, which is potentially relevant to a society’s current interests, habits, social behaviors, crime tendency and other tendencies. Here, sex identification is taken as a base study in information mining in chat mediums. In order to do this, a simple discrimination function and semantic analysis method are proposed for sex identification in Turkish chat mediums. Then, the proposed sex identification method is compared with the Support Vector Machine {(SVM)} and Naive Bayes {(NB)} methods. Finally, results show that the proposed system has achieved accuracy over 90\% in sex identification.}
}
-
M. Grcar, M. Grobelnik, and D. Mladenic, "Using text mining and link analysis for software mining," in Mining complex data : ECML/PKDD 2007 third international workshop, MCD 2007, Warsaw, Poland, september 17-21, 2007 : revised selected papers, Berlin; Heidelberg, 2008, pp. 1-12.
% Conference paper in the LNCS series. The series number lives in `volume`
% and the bare DOI (taken from the resolver URL) in `doi`.
@inproceedings{grcar_using_2008,
  author    = {Grcar, Miha and Grobelnik, Marko and Mladenic, Dunja},
  title     = {Using text mining and link analysis for software mining},
  booktitle = {Mining complex data : {ECML/PKDD} 2007 third international workshop, {MCD} 2007, Warsaw, Poland, September 17-21, 2007 : revised selected papers},
  series    = {Lecture Notes in Computer Science},
  volume    = {4944},
  pages     = {1--12},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2008},
  isbn      = {978-3-540-68415-2},
  doi       = {10.1007/978-3-540-68416-9_1},
  url       = {http://dx.doi.org/10.1007/978-3-540-68416-9_1},
  keywords  = {Fouille de texte},
  abstract  = {Many data mining techniques are these days in use for ontology learning – text mining, Web mining, graph mining, link analysis, relational data mining, and so on. In the current state-of-the-art bundle there is a lack of “software mining” techniques. This term denotes the process of extracting knowledge out of source code. In this paper we approach the software mining task with a combination of text mining and link analysis techniques. We discuss how each instance (i.e. a programming construct such as a class or a method) can be converted into a feature vector that combines the information about how the instance is interlinked with other instances, and the information about its (textual) content. The so-obtained feature vectors serve as the basis for the construction of the domain ontology with {OntoGen,} an existing system for semi-automatic data-driven ontology construction.}
}
-
M. Enkhsaikhan, W. Wong, W. Liu, and M. Reynolds, "Measuring data-driven ontology changes using text mining," , Gold Coast, Australia, 2007, pp. 39-46.
% Conference paper. The per-session CFID/CFTOKEN parameters have been removed
% from the URL.
% NOTE(review): this record has no booktitle, which inproceedings entries
% require, and `address` looks like the conference venue rather than the
% publisher's city -- TODO confirm and complete from the publisher record.
@inproceedings{enkhsaikhan_measuring_2007,
  author    = {Enkhsaikhan, Majigsuren and Wong, Wilson and Liu, Wei and Reynolds, Mark},
  title     = {Measuring data-driven ontology changes using text mining},
  pages     = {39--46},
  publisher = {Australian Computer Society, Inc.},
  address   = {Gold Coast, Australia},
  year      = {2007},
  isbn      = {978-1-920682-51-4},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1378252&type=pdf&coll=ACM&dl=ACM},
  keywords  = {Fouille de texte, Ontologie},
  abstract  = {Most current ontology management systems concentrate on detecting usage-driven changes and representing changes formally in order to maintain the consistency. In this paper, we present a semi-automatic approach for measuring and visualising data-driven changes through ontology learning. Terms are first generated using text mining techniques using an ontology learning module, and then classified automatically into clusters. The clusters are then manually named, which is the only manual process in this system. Each cluster is considered as a sub-concept of the root concept, and thus one dimension of the feature space describing the root concept. The changes of terms in each cluster contributes to the change of the root concept. Using our system, Web documents are collected at different time periods and fed into the system to generate different versions of the same ontology for each time period. The paper presents several ways of visualising and analysing the changes. Initial experiments on online media data have demonstrated the promising capabilities of our system.},
  annote    = {{{\textless}p{\textgreater}enkhsaikhanMajigsuren2007.pdf{\textless}/p{\textgreater}}}
}
-
S. Burk, "An automated scoring system for measuring email emotion," Marketing Bulletin, vol. 18, pp. 1-12, 2007.
% Journal article. The export had the document type "Article" in the doi
% field; that is not a DOI, so no doi is stored (the document type is still
% recorded in the annote).
@article{burk_automated_2007,
  author   = {Burk, Scott},
  title    = {An automated scoring system for measuring email emotion},
  journal  = {Marketing Bulletin},
  volume   = {18},
  pages    = {1--12},
  month    = may,
  year     = {2007},
  issn     = {0113-6895},
  url      = {http://search.ebscohost.com/login.aspx?direct=true&db=buh&AN=31793018&site=ehost-live},
  keywords = {Fouille de texte},
  abstract = {Recently there has been much interest in detecting emotional content in unstructured data by the machine learning and user interface communities. Research has been conducted in developing algorithms to detect emotional content in video and voice with additional work done in mining information from text. There are a number of application areas where interactive marketing activities can gain from mining emotion in text. In this paper we examine the application of an emotional scoring algorithm to Customer Relationship Management {(CRM)} activities, primarily in customer service operations. We examine the results of a pilot program with a large {U.S.} top 20 internet retailer to mine emotional content in email. A system was developed to determine an applicable score for individual emails. The system allows operations to more quickly and appropriately respond to irate or emotionally charged customers. We show that an effective algorithm can be developed inexpensively and on a brief project timeline. We provide some lessons learned and a basic architecture for the system. {ABSTRACT} {FROM} {AUTHOR} Copyright of Marketing Bulletin is the property of Massey University and its content may not be copied or emailed to multiple sites or posted to a listserv without the copyright holder's express written permission. However, users may print, download, or email articles for individual use. This abstract may be abridged. No warranty is given about the accuracy of the copy. Users should refer to the original published version of the material for the full abstract. {(Copyright} applies to all Abstracts)},
  annote   = {{{\textless}p{\textgreater}Accession} Number: 31793018; Burk, Scott; Issue Info: May2007, Vol. 18, p1; Thesaurus Term: {ALGORITHMS;} Thesaurus Term: {EMOTIONS;} Thesaurus Term: {ELECTRONIC} mail messages; Thesaurus Term: {CUSTOMER} relationship management; Thesaurus Term: {CUSTOMER} services; Thesaurus Term: {INTERACTIVE} marketing; Thesaurus Term: {ELECTRONIC} commerce; Subject Term: {TEXT} mining {(Information} retrieval); Subject: {UNITED} States; {Author-Supplied} Keyword: Customer Relationship Management; {Author-Supplied} Keyword: Email; {Author-Supplied} Keyword: Scoring Algorithm; {Author-Supplied} Keyword: Text Mining; {NAICS/Industry} Codes: 425110 Business to Business Electronic Markets; {NAICS/Industry} Codes: 454111 Electronic Shopping; Number of Pages: 12p; Document Type: Article{\textless}/p{\textgreater}}
}
-
H. Anaya-Sánchez, R. Berlanga-Llavori, and A. Pons-Porrata, "Retrieval of relevant concepts from a text collection," in Current topics in artificial intelligence : 12th conference of the Spanish association for artificial intelligence, CAEPIA 2007, Salamanca, Spain, november 12-16, 2007 : selected papers, Berlin; Heidelberg, 2007, pp. 21-30.
% Conference paper in the LNCS series. Hyphenated surnames are given in
% "Last, First" form, so no protective braces are needed. The series number
% lives in `volume`, and the bare DOI (taken from the resolver URL) in `doi`.
@inproceedings{anaya-snchez_retrieval_2007,
  author    = {Anaya-S{\'a}nchez, Henry and Berlanga-Llavori, Rafael and Pons-Porrata, Aurora},
  title     = {Retrieval of relevant concepts from a text collection},
  booktitle = {Current topics in artificial intelligence : 12th conference of the Spanish association for artificial intelligence, {CAEPIA} 2007, Salamanca, Spain, November 12-16, 2007 : selected papers},
  series    = {Lecture Notes in Computer Science},
  volume    = {4788},
  pages     = {21--30},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2007},
  doi       = {10.1007/978-3-540-75271-4_3},
  url       = {http://dx.doi.org/10.1007/978-3-540-75271-4_3},
  keywords  = {Analyse de corpus, Fouille de texte, Recherche d'information},
  abstract  = {This paper addresses the characterization of a large text collection by introducing a method for retrieving sets of relevant {WordNet} concepts as descriptors of the collection contents. The method combines models for identifying interesting word co-occurrences with an extension of a word sense disambiguation algorithm in order to retrieve the concepts that better fit in with the collection topics. Multi-word nominal concepts that do not explicitly appear in the texts, can be found among the retrieved concepts. We evaluate our proposal using extensions of recall and precision that are also introduced in this paper.}
}
-
E. Perez, "Managing the information explosion : with power text solutions’ text mining and summarization software," ONLINE, vol. 31, iss. 5, pp. 34-38, 2007.
% Magazine article in ONLINE. The author's given name is only known as an
% initial, which is now punctuated and stored in "Last, First" form.
@article{perez_managinginformation_2007,
  author   = {Perez, E.},
  title    = {Managing the information explosion : with power text solutions' text mining and summarization software},
  journal  = {{ONLINE}},
  volume   = {31},
  number   = {5},
  pages    = {34--38},
  month    = oct,
  year     = {2007},
  issn     = {0146-5422},
  url      = {http://apps.isiknowledge.com/full_record.do?product=WOS&search_mode=CitationReport&qid=4&SID=1EOnkFI4kknf@49oNkC&page=1&doc=29},
  keywords = {Fouille de texte}
}
-
R. Feldman and J. Sanger, The text mining handbook : advanced approaches in analyzing unstructured data, Cambridge: Cambridge University Press, 2007.
% Monograph published by Cambridge University Press (2007).
@book{feldman_text_2007-1,
  author    = {Feldman, Ronen and Sanger, James},
  title     = {The text mining handbook : advanced approaches in analyzing unstructured data},
  publisher = {Cambridge University Press},
  address   = {Cambridge},
  year      = {2007},
  isbn      = {0521836573},
  keywords  = {Fouille de texte}
}
-
Y. Wu, H. Siy, M. Zand, and V. Winter, "Construction of ontology-based software repositories by text mining." Springer, 2007, pp. 790-797.
% Chapter in an LNCS proceedings volume. The series number lives in `volume`
% and the bare DOI (taken from the resolver URL) in `doi`.
@incollection{wu_construction_2007,
  author    = {Wu, Yan and Siy, Harvey and Zand, Mansour and Winter, Victor},
  title     = {Construction of ontology-based software repositories by text mining},
  booktitle = {Computational science – {ICCS} 2007 : 7th international conference, Beijing, China, May 27 - 30, 2007 : proceedings, part {III}},
  series    = {Lecture Notes in Computer Science},
  volume    = {4489},
  pages     = {790--797},
  publisher = {Springer},
  year      = {2007},
  doi       = {10.1007/978-3-540-72588-6_128},
  url       = {http://dx.doi.org/10.1007/978-3-540-72588-6_128},
  keywords  = {Fouille de texte, Ontologie},
  abstract  = {Software document repositories store artifacts produced in the course of developing software products. But most repositories are simply archives of documents. It is not unusual to find projects where different software artifacts are scattered in unrelated repositories with varying levels of granularity and without a centralized management system. This makes the information available in existing repositories difficult to reuse. In this paper, a methodology for constructing an ontology-based repository of reusable knowledge is presented. The information in the repository is extracted from specification documents using text mining. Ontologies are used to guide the extraction process and organize the extracted information. The methodology is being used to develop a repository of recurring and crosscutting aspects in software specification documents.}
}
-
P. Zweigenbaum, D. Demner-Fushman, H. Yu, and K. B. Cohen, "Frontiers of biomedical text mining : current progress," Briefings in Bioinformatics, vol. 8, iss. 5, pp. 358-375, 2007.
% Journal article. The export had the document type "Article" in the doi
% field; that is not a DOI, so no doi is stored (the document type is still
% recorded in the annote).
@article{zweigenbaum_frontiers_2007,
  author     = {Zweigenbaum, Pierre and Demner-Fushman, Dina and Yu, Hong and Cohen, Kevin B.},
  title      = {Frontiers of biomedical text mining : current progress},
  shorttitle = {Frontiers of biomedical text mining},
  journal    = {Briefings in Bioinformatics},
  volume     = {8},
  number     = {5},
  pages      = {358--375},
  year       = {2007},
  issn       = {1467-5463},
  keywords   = {Extraction d'information, Fouille de texte, Recherche d'information},
  abstract   = {It is now almost 15 years since the publication of the first paper on text mining in the genomics domain, and decades since the first paper on text mining in the medical domain. Enormous progress has been made in the areas of information retrieval, evaluation methodologies and resource construction. Some problems, such as abbreviation- handling, can essentially be considered solved problems, and others, such as identification of gene mentions in text, seem likely to be solved soon. However, a number of problems at the frontiers of biomedical text mining continue to present interesting challenges and opportunities for great improvements and interesting research. In this article we review the current state of the art in biomedical text mining or {‘BioNLP’} in general, focusing primarily on papers published within the past year. {ABSTRACT} {FROM} {AUTHOR} Copyright of Briefings in Bioinformatics is the property of Oxford University Press / {UK} and its content may not be copied or emailed to multiple sites or posted to a listserv without the copyright holder's express written permission. However, users may print, download, or email articles for individual use. This abstract may be abridged. No warranty is given about the accuracy of the copy. Users should refer to the original published version of the material for the full abstract. {(Copyright} applies to all Abstracts)},
  annote     = {{{\textless}p{\textgreater}Accession} Number: 27743245; Zweigenbaum, Pierre; Email Address: pz@limsi.ft; {Demner-Fushman,} Dina; Hong Yu; Cohen, Kevin B.; Issue Info: Sep2007, Vol. 8 Issue 5, p358; Thesaurus Term: {DATA} mining; Thesaurus Term: {INFORMATION} retrieval; Thesaurus Term: {INFORMATION} resources; Thesaurus Term: {SEARCH} engines; Thesaurus Term: {INFORMATION} science; Subject Term: {TEXT} mining {(Information} retrieval); Subject Term: {MEDICAL} sciences; Subject Term: {MOLECULAR} genetics; {Author-Supplied} Keyword: evaluation; {Author-Supplied} Keyword: image mining; {Author-Supplied} Keyword: in formation extraction; {Author-Supplied} Keyword: literature-based discovery; {Author-Supplied} Keyword: natural language processing; {Author-Supplied} Keyword: question answering; {Author-Supplied} Keyword: text mining; {Author-Supplied} Keyword: text summarization; {Author-Supplied} Keyword: user orientation; Number of Pages: 18p; Document Type: Article{\textless}/p{\textgreater}}
}
-
M. Grobelnik, D. Mladenic, and B. Fortuna, "Text mining and link analysis for web and semantic web," San Jose, California, USA, 2007, p. 1.
% Conference item, one page long. The per-session CFID/CFTOKEN parameters
% have been removed from the URL.
% NOTE(review): the export listed the third author as "Fortuna Blaz", which
% looks like swapped given/family names; stored here with Fortuna as the
% surname -- TODO confirm.
% NOTE(review): this record has no booktitle, which inproceedings entries
% require, and `address` looks like the conference venue -- TODO complete.
@inproceedings{grobelnik_text_2007,
  author    = {Grobelnik, Marko and Mladenic, Dunja and Fortuna, Blaz},
  title     = {Text mining and link analysis for web and semantic web},
  pages     = {1},
  publisher = {{ACM}},
  address   = {San Jose, California, {USA}},
  year      = {2007},
  isbn      = {978-1-59593-609-7},
  url       = {http://portal.acm.org/citation.cfm?id=1281192.1327960&coll=ACM&dl=ACM},
  keywords  = {Fouille de texte, Web sémantique}
}
-
C. C. Trumbach and D. Payne, "Identifying synonymous concepts in preparation for technology mining," Journal of Information Science, vol. 33, iss. 6, pp. 660-677, 2007.
% Journal article. The ISSN is hyphenated to match the form used by the other
% Journal of Information Science record in this file.
@article{trumbach_identifying_2007,
  author   = {Trumbach, Cherie Courseault and Payne, Dinah},
  title    = {Identifying synonymous concepts in preparation for technology mining},
  journal  = {Journal of Information Science},
  volume   = {33},
  number   = {6},
  pages    = {660--677},
  month    = dec,
  year     = {2007},
  issn     = {0165-5515},
  doi      = {10.1177/0165551506076401},
  keywords = {Fouille de donnée, Fouille de texte},
  abstract = {In this research, the development of a 'concept-clumping algorithm' designed to improve the clustering of technical concepts is demonstrated. The algorithm developed first identifies a list of technically relevant noun phrases from a cleaned extracted list and then applies a rule-based algorithm for identifying synonymous terms based on shared words in each term. An assessment of the algorithm found that the algorithm has an 89-91\% precision rate, was successful in moving technically important terms higher in the term frequency list, and improved the technical specificity of term clusters. {ABSTRACT} {FROM} {AUTHOR} Copyright of Journal of Information Science is the property of Sage Publications, Ltd. and its content may not be copied or emailed to multiple sites or posted to a listserv without the copyright holder's express written permission. However, users may print, download, or email articles for individual use. This abstract may be abridged. No warranty is given about the accuracy of the copy. Users should refer to the original published version of the material for the full abstract. {(Copyright} applies to all Abstracts)},
  annote   = {{{\textless}p{\textgreater}Accession} Number: 28141770; Trumbach, Cherie Courseault 1; Email Address: ctrumbac@uno.edu; Payne, Dinah 1; Affiliations: 1: Department of Management, University of New Orleans, New Orleans, {USA;} Issue Info: 2007, Vol. 33 Issue 6, p660; Thesaurus Term: {DATA} mining; Thesaurus Term: {DATABASE} searching; Thesaurus Term: {INFORMATION} resources; Thesaurus Term: {INFORMATION} science; Thesaurus Term: {DATABASES;} Subject Term: {TEXT} mining {(Information} retrieval); Subject Term: {BIBLIOMETRICS;} Subject Term: {STATISTICAL} methods; Subject Term: {COMPUTER} files; {Author-Supplied} Keyword: data quality; {Author-Supplied} Keyword: knowledge discovery; {Author-Supplied} Keyword: term similarity; {Author-Supplied} Keyword: text cleaning; {Author-Supplied} Keyword: text mining; Number of Pages: 18p; Illustrations: 10 charts, 1 diagram; Document Type: Article{\textless}/p{\textgreater}}
}
-
C.-P. Wei, P. J.-H. Hu, C.-H. Tai, C.-N. Huang, and C.-S. Yang, "Managing word mismatch problems in information retrieval : a topic-based query expansion approach," Journal of Management Information Systems, vol. 24, iss. 3, pp. 269-295, 2007.
% Journal article. The export stored the author names in all capitals with
% every token braced, which hid the given names from initial extraction; they
% are now in normal case and "Last, First" form. The doi field held the
% document type "Article" rather than a DOI, so no doi is stored.
@article{chih-ping_wei_managing_2007,
  author     = {Wei, Chih-Ping and Hu, Paul Jen-Hwa and Tai, Chia-Hung and Huang, Chun-Neng and Yang, Chin-Sheng},
  title      = {Managing word mismatch problems in information retrieval : a topic-based query expansion approach},
  shorttitle = {Managing Word Mismatch Problems in Information Retrieval},
  journal    = {Journal of Management Information Systems},
  volume     = {24},
  number     = {3},
  pages      = {269--295},
  year       = {2007},
  issn       = {0742-1222},
  url        = {http://search.ebscohost.com/login.aspx?direct=true&db=buh&AN=28857214&site=ehost-live},
  keywords   = {Bibliothèque numérique, Cluster, Fouille de texte, Recherche d'information},
  abstract   = {Word mismatch represents a fundamental information retrieval challenge that has become increasingly important as electronic document repositories (e.g., Web resources, digital libraries) grow in number and sheer volume. In general, word mismatch refers to the phenomenon in which a concept is described by different terms in user queries and in source documents. Query expansion represents a promising avenue to address such problems. Previous research predominantly approaches query expansion on the basis of global or local analysis. However, these approaches emphasize a global perspective rather than taking a topic-specific view of term associations. As a consequence, their effectiveness can be severely constrained when the document corpus spans a diverse set of topics. In this study, we propose a topic-based approach for query expansion and develop and empirically evaluate two novel methods--namely, nonfuzzy and fuzzy topic-based query expansion--to address word mismatch problems. According to our evaluation results, the proposed topic-based approach is more effective than a benchmark global analysis method, particularly when user queries consist of multiple query terms. {ABSTRACT} {FROM} {AUTHOR} Copyright of Journal of Management Information Systems is the property of {M.E.} Sharpe Inc. and its content may not be copied or emailed to multiple sites or posted to a listserv without the copyright holder's express written permission. However, users may print, download, or email articles for individual use. This abstract may be abridged. No warranty is given about the accuracy of the copy. Users should refer to the original published version of the material for the full abstract. {(Copyright} applies to all Abstracts)},
  annote     = {{{\textless}p{\textgreater}Accession} Number: 28857214; {CHIH-PING} {WEI} 1; {JEN-HWA} {HU,} {PAUL} 2; {CHIA-HUNG} {TAI} 3; {CHUN-NENG} {HUANG} 4; {CHIN-SHENG} {YANG} 5; Affiliations: 1: Professor, Institute of Technology Management, National Tsing Hua University, Taiwan; 2: Associate Professor and David Eccles Faculty Fellow, David Eccles School of Business, University of Utah; 3: Research Assistant, Institute of Information Science, Academia Sinica, Taiwan; 4: Project Manager Engineer, Asiatek Taiwan; 5: Second Lieutenant, Republic of China Army; Issue Info: Winter2007/2008, Vol. 24 Issue 3, p269; Thesaurus Term: {INFORMATION} technology; Thesaurus Term: {INFORMATION} retrieval; Thesaurus Term: {MANAGEMENT;} Subject Term: {DOCUMENT} clustering; Subject Term: {TEXT} mining {(Information} retrieval); Subject Term: {DIGITAL} libraries; {NAICS/Industry} Codes: 519120 Libraries and Archives; Number of Pages: 27p; Illustrations: 4 charts, 1 diagram, 3 graphs; Document Type: Article{\textless}/p{\textgreater}}
}
-
R. Sanderson and P. Watry, "Integrating data and text mining processes for digital library applications," , Vancouver, BC, Canada, 2007, pp. 73-79.
% Conference paper. The per-session CFID/CFTOKEN parameters have been removed
% from the URL; the DOI is the stable identifier.
% NOTE(review): this record has no booktitle, which inproceedings entries
% require, and `address` looks like the conference venue rather than the
% publisher's city -- TODO confirm and complete from the publisher record.
@inproceedings{sanderson_integrating_2007,
  author    = {Sanderson, Robert and Watry, Paul},
  title     = {Integrating data and text mining processes for digital library applications},
  pages     = {73--79},
  publisher = {{ACM}},
  address   = {Vancouver, {BC}, Canada},
  year      = {2007},
  isbn      = {978-1-59593-644-8},
  doi       = {10.1145/1255175.1255188},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1255188&type=pdf&coll=ACM&dl=ACM},
  keywords  = {Bibliothèque numérique, Fouille de donnée, Fouille de texte},
  abstract  = {This paper explores the integration of text mining and data mining techniques, digital library systems, and computational and data grid technologies with the objective of developing an online classification service exemplar. We discuss the current research issues relating to the use of data mining algorithms and toolkits for textual data; the necessary changes within the Cheshire3 Information Framework to accommodate analysis workflows; the outcomes of a demonstrator based on the National Library of Medicine's Medline dataset; and the provision of comparable metrics for evaluation purposes. The prototype has resulted in extremely accurate online classification services and offers a novel method of supporting text mining and data mining within a highly scaled computational environment, integrated seamlessly into the digital library architecture.},
  annote    = {{{\textless}p{\textgreater}sandersonRobert2007.pdf{\textless}/p{\textgreater}}}
}
-
P. de Castro, F. de França, H. Ferreira, and F. Von Zuben, "Applying biclustering to text mining : an immune-inspired approach," in Artificial immune systems : 6th international conference, ICARIS 2007, Santos, Brazil, august 26-29, 2007 : proceedings, Berlin; Heidelberg, 2007, pp. 83-94.
% Conference paper in the LNCS series. Names are in "Last, First" form so that
% the capitalised particle in "Von Zuben" stays part of the surname instead of
% being read as a middle name. The series number lives in `volume`, and the
% bare DOI (taken from the resolver URL) in `doi`.
@inproceedings{de_castro_applying_2007,
  author    = {de Castro, Pablo and de Fran{\c{c}}a, Fabr{\'\i}cio and Ferreira, Hamilton and Von Zuben, Fernando},
  title     = {Applying biclustering to text mining : an immune-inspired approach},
  booktitle = {Artificial immune systems : 6th international conference, {ICARIS} 2007, Santos, Brazil, August 26-29, 2007 : proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {4628},
  pages     = {83--94},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2007},
  doi       = {10.1007/978-3-540-73922-7_8},
  url       = {http://dx.doi.org/10.1007/978-3-540-73922-7_8},
  keywords  = {Fouille de texte},
  abstract  = {With the rapid development of information technology, computers are proving to be a fundamental tool for the organization and classification of electronic texts, given the huge amount of available information. The existent methodologies for text mining apply standard clustering algorithms to group similar texts. However, these algorithms generally take into account only the global similarities between the texts and assign each one to only one cluster, limiting the amount of information that can be extracted from the texts. An alternative proposal capable of solving these drawbacks is the biclustering technique. The biclustering is able to perform clustering of rows and columns simultaneously, allowing a more comprehensive analysis of the texts. The main contribution of this paper is the development of an immune-inspired biclustering algorithm to carry out text mining, denoted {BIC-aiNet.} {BIC-aiNet} interprets the biclustering problem as several two-way bipartition problems, instead of considering a single two-way permutation framework. The experimental results indicate that our proposal is able to group similar texts efficiently and extract implicit useful information from groups of texts.}
}
-
I. Sato and H. Nakagawa, "Semi-structure mining method for text mining with a chunk-based dependency structure," in Advances in knowledge discovery and data mining : 11th Pacific-Asia conference, PAKDD 2007, Nanjing, China, may 22-25, 2007 : proceedings, Berlin; Heidelberg, 2007, pp. 777-784.
@inproceedings{sato_semi-structure_2007,
  author    = {Sato, Issei and Nakagawa, Hiroshi},
  title     = {Semi-structure mining method for text mining with a chunk-based dependency structure},
  booktitle = {Advances in knowledge discovery and data mining : 11th {Pacific-Asia} conference, {PAKDD} 2007, Nanjing, China, may 22-25, 2007 : proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {4426},
  pages     = {777--784},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2007},
  url       = {http://dx.doi.org/10.1007/978-3-540-71701-0_85},
  keywords  = {Fouille de texte},
  abstract  = {In text mining, when we need more precise information than word frequencies such as the relationships among words, it is necessary to extract frequent patterns of words with a dependency structure in a sentence. This paper proposes a semi-structure mining method for extracting frequent patterns of words with a dependency structure from a text corpus. First, it describes the data structure representing the dependency structure. This is a tree structure in which each node has multiple items. Then, a mining algorithm for this data structure is described. Our method can extract frequent patterns that cannot be extracted by conventional methods.},
}
-
E. Meglio, M. G. Grassia, and M. Misuraca, "The ideal candidate : analysis of professional competences through text mining of job offers." Heidelberg: Physica-Verlag, 2007, pp. 261-275.
@incollection{meglio_ideal_2007,
  author    = {Meglio, Emilio and Grassia, Maria Gabriella and Misuraca, Michelangelo},
  title     = {The ideal candidate : analysis of professional competences through text mining of job offers},
  booktitle = {Effectiveness of university education in Italy : employability, competences, human capital},
  pages     = {261--275},
  publisher = {{Physica-Verlag}},
  address   = {Heidelberg},
  year      = {2007},
  isbn      = {978-3-7908-1749-2},
  url       = {http://dx.doi.org/10.1007/978-3-7908-1751-5_19},
  keywords  = {Fouille de texte},
  abstract  = {The aim of this paper is to propose analytical tools for identifying peculiar aspects of the job market for graduates. The main objective is to reduce the complexity of the phenomenon, both on the variable side, by transforming the collected information into latent factors, and on the unit side, by classifying observations. We propose a strategy for dealing with data that have different source and nature. The dependence structure is investigated to identify potential evolutionary paths. Moreover, symbolic objects and their graphical representation are used for identifying the peculiar characteristics required by companies operating in different economic sectors.},
}
-
F. Ibekwe-SanJuan, Fouille de textes : méthodes, outils et applications, 2007.
@book{ibekwe-sanjuan_fouille_2007-1,
  author    = {{Ibekwe-SanJuan}, Fidelia},
  title     = {Fouille de textes : méthodes, outils et applications},
  publisher = {Hermès science publications},
  address   = {Paris},
  year      = {2007},
  isbn      = {9782746216099},
  keywords  = {Fouille de texte},
}
-
A. Pons-Porrata, R. Berlanga-Llavori, and J. Ruiz-Shulcloper, "Topic discovery based on text mining techniques," Information Processing \& Management, vol. 43, iss. 3, pp. 752-768, 2007.
@article{pons-porrata_topic_2007,
  author   = {{Pons-Porrata}, A. and {Berlanga-Llavori}, R. and {Ruiz-Shulcloper}, J.},
  title    = {Topic discovery based on text mining techniques},
  journal  = {Information Processing \& Management},
  volume   = {43},
  number   = {3},
  pages    = {752--768},
  month    = may,
  year     = {2007},
  issn     = {0306-4573},
  url      = {http://apps.isiknowledge.com/full_record.do?product=WOS&search_mode=CitationReport&qid=4&SID=1EOnkFI4kknf@49oNkC&page=1&doc=38},
  keywords = {Cluster, Fouille de texte},
  abstract = {In this paper, we present a topic discovery system aimed to reveal the implicit knowledge present in news streams. This knowledge is expressed as a hierarchy of topic/subtopics, where each topic contains the set of documents that are related to it and a summary extracted from these documents. Summaries so built are useful to browse and select topics of interest from the generated hierarchies. Our proposal consists of a new incremental hierarchical clustering algorithm, which combines both partitional and agglomerative approaches, taking the main benefits from them. Finally, a new summarization method based on Testor Theory has been proposed to build the topic summaries. Experimental results in the {TDT2} collection demonstrate its usefulness and effectiveness not only as a topic detection system, but also as a classification and summarization tool. (c) 2006 Elsevier Ltd. All rights reserved.},
}
-
J. Atkinson, "Evolving explanatory novel patterns for semantically-based text mining." London: Springer, 2007, pp. 145-169.
@incollection{atkinson_evolving_2007,
  author    = {Atkinson, John},
  title     = {Evolving explanatory novel patterns for semantically-based text mining},
  booktitle = {Natural language processing and text mining},
  pages     = {145--169},
  publisher = {Springer},
  address   = {London},
  year      = {2007},
  url       = {http://dx.doi.org/10.1007/978-1-84628-754-1_9},
  keywords  = {Fouille de texte},
  abstract  = {An important problem with mining textual information is that in this unstructured form is not readily accessible to be used by computers. This has been written for human readers and requires, when feasible, some natural language interpretation. Although full processing is still out of reach with current technology, there are tools using basic pattern recognition techniques and heuristics that are capable of extracting valuable information from free text based on the elements contained in it (i.e., keywords). This technology is usually referred to as Text Mining, and aims at discovering unseen and interesting patterns in textual databases [8], [19].},
}
-
J. D. Holt and S. M. Chung, "Parallel mining of association rules from text databases," Journal of Supercomputing, vol. 39, iss. 3, pp. 273-299, 2007.
@article{holt_parallel_2007,
  author   = {Holt, J. D. and Chung, S. M.},
  title    = {Parallel mining of association rules from text databases},
  journal  = {Journal of Supercomputing},
  volume   = {39},
  number   = {3},
  pages    = {273--299},
  month    = mar,
  year     = {2007},
  issn     = {0920-8542},
  url      = {http://apps.isiknowledge.com/full_record.do?product=WOS&search_mode=CitationReport&qid=4&SID=1EOnkFI4kknf@49oNkC&page=1&doc=37},
  keywords = {Fouille de texte},
  abstract = {In this paper, we propose a new algorithm named Parallel Multipass with Inverted Hashing and Pruning {(PMIHP)} for mining association rules between words in text databases. The characteristics of text databases are quite different from those of retail transaction databases, and existing mining algorithms cannot handle text databases efficiently because of the large number of itemsets (i.e., sets of words) that need to be counted. The new {PMIHP} algorithm is a parallel version of our Multipass with Inverted Hashing and Pruning {(MIHP)} algorithm {(Holt,} Chung in: Proc of the 14th {IEEE} int'l conf on tools with artificial intelligence, 2002, pp 49-56), which was shown to be quite efficient than other existing algorithms in the context of mining text databases. The {PMIHP} algorithm reduces the overhead of communication between miners running on different processors because they are mining local databases asynchronously and prune the global candidates by using the Inverted Hashing and Pruning technique. Compared with the well-known Count Distribution algorithm {(Agrawal,} Shafer in: (1996) {IEEE} Trans Knowl Data Eng 8(6):962-969), {PMIHP} demonstrates superior performance characteristics for mining association rules in large text databases, and when the minimum support level is low, its speedup is superlinear as the number of processors increases. These experiments were performed on a cluster of Linux workstations using a collection of Wall Street Journal articles.},
}
-
A. Don, E. Zheleva, M. Gregory, S. Tarkan, L. Auvil, T. Clement, B. Shneiderman, and C. Plaisant, "Discovering interesting usage patterns in text collections: integrating text mining with visualization," in Proceedings of the sixteenth ACM conference on Conference on information and knowledge management, Lisbon, Portugal, 2007, pp. 213-222.
@inproceedings{don_discovering_2007,
  author     = {Don, Anthony and Zheleva, Elena and Gregory, Machon and Tarkan, Sureyya and Auvil, Loretta and Clement, Tanya and Shneiderman, Ben and Plaisant, Catherine},
  title      = {Discovering interesting usage patterns in text collections: integrating text mining with visualization},
  shorttitle = {Discovering interesting usage patterns in text collections},
  booktitle  = {Proceedings of the sixteenth {ACM} conference on Conference on information and knowledge management},
  pages      = {213--222},
  publisher  = {{ACM}},
  address    = {Lisbon, Portugal},
  year       = {2007},
  isbn       = {978-1-59593-803-9},
  doi        = {10.1145/1321440.1321473},
  url        = {http://portal.acm.org/citation.cfm?id=1321473},
  keywords   = {Fouille de texte, Visualisation de l'information},
  abstract   = {This paper addresses the problem of making text mining results more comprehensible to humanities scholars, journalists, intelligence analysts, and other researchers, in order to support the analysis of text collections. Our system, {FeatureLens1,} visualizes a text collection at several levels of granularity and enables users to explore interesting text patterns. The current implementation focuses on frequent itemsets of n-grams, as they capture the repetition of exact or similar expressions in the collection. Users can find meaningful co-occurrences of text patterns by visualizing them within and across documents in the collection. This also permits users to identify the temporal evolution of usage such as increasing, decreasing or sudden appearance of text patterns. The interface could be used to explore other text features as well. Initial studies suggest that {FeatureLens} helped a literary scholar and 8 users generate new hypotheses and interesting insights using 2 text collections.},
  annote     = {{{\textless}p{\textgreater}anthonyDon2007.pdf{\textless}/p{\textgreater}}},
}
-
R. Feldman and J. Sanger, The text mining handbook : advanced approaches in analyzing unstructured data, Cambridge ; New York: Cambridge University Press, 2007.
@book{feldman_text_2007,
  author    = {Feldman, Ronen and Sanger, James},
  title     = {The text mining handbook : advanced approaches in analyzing unstructured data},
  publisher = {Cambridge University Press},
  address   = {Cambridge ; New York},
  year      = {2007},
  isbn      = {9780521836579},
  url       = {http://library.books24x7.com/toc.asp?bookid=23164},
  keywords  = {Fouille de texte},
  annote    = {{\textless}p{\textgreater}Accessible en ligne via Books24x7 (http://library.books24x7.com/toc.asp?bookid=23164){\textless}/p{\textgreater} {\textless}p{\textgreater}TOC : Introduction to text mining -- Core text mining operations -- Text mining preprocessing techniques -- Categorization -- Clustering -- Information extraction -- Probabilistic models for information extraction -- Preprocessing applications using probabilistic and hybrid approaches -- Presentation-layer considerations for browsing and query refinement -- Visualization approaches -- Link analysis -- Text mining applications.{\textless}/p{\textgreater}},
}
-
Y.-H. Tseng, C.-J. Lin, and Y.-I. Lin, "Text mining techniques for patent analysis," Information Processing \& Management, vol. 43, iss. 5, pp. 1216-1247, 2007.
@article{tseng_text_2007,
  author   = {Tseng, Yuen-Hsien and Lin, Chi-Jen and Lin, Yu-I},
  title    = {Text mining techniques for patent analysis},
  journal  = {Information Processing \& Management},
  volume   = {43},
  number   = {5},
  pages    = {1216--1247},
  year     = {2007},
  url      = {http://www.sciencedirect.com/science/article/B6VC8-4MX54T9-4/2/911031b1e2680af0ef27c7afc4dd590d},
  keywords = {Catégorisation, Cluster, Fouille de texte},
  abstract = {Patent documents contain important research results. However, they are lengthy and rich in technical terminology such that it takes a lot of human efforts for analyses. Automatic tools for assisting patent engineers or decision makers in patent analysis are in great demand. This paper describes a series of text mining techniques that conforms to the analytical process used by patent analysts. These techniques include text segmentation, summary extraction, feature selection, term association, cluster generation, topic identification, and information mapping. The issues of efficiency and effectiveness are considered in the design of these techniques. Some important features of the proposed methodology include a rigorous approach to verify the usefulness of segment extracts as the document surrogates, a corpus- and dictionary-free algorithm for keyphrase extraction, an efficient co-word analysis method that can be applied to large volume of patents, and an automatic procedure to create generic cluster titles for ease of result interpretation. Evaluation of these techniques was conducted. The results confirm that the machine-generated summaries do preserve more important content words than some other sections for classification. To demonstrate the feasibility, the proposed methodology was applied to a real-world patent set for domain analysis and mapping, which shows that our approach is more effective than existing classification systems. The attempt in this paper to automate the whole process not only helps create final patent maps for topic analyses, but also facilitates or improves other patent analysis tasks such as patent classification, organization, knowledge sharing, and prior art searches.},
}
-
Y. Yamamoto and T. Takagi, "Biomedical knowledge navigation by literature clustering," Journal of Biomedical Informatics, vol. 40, iss. 2, pp. 114-130, 2007.
@article{yamamoto_biomedical_2007,
  author   = {Yamamoto, Yasunori and Takagi, Toshihisa},
  title    = {Biomedical knowledge navigation by literature clustering},
  journal  = {Journal of Biomedical Informatics},
  volume   = {40},
  number   = {2},
  pages    = {114--130},
  year     = {2007},
  url      = {http://www.sciencedirect.com/science/article/B6WHD-4KK2R71-1/2/73f35c127cab2d3813f8f321ca56da1a},
  keywords = {Cluster, Fouille de texte},
  abstract = {There is an urgent need for a system that facilitates surveys by biomedical researchers and the subsequent formulation of hypotheses based on the knowledge stored in literature. One approach is to cluster papers discussing a topic of interest and reveal its sub-topics that allow researchers to acquire an overview of the topic. We developed such a system called {McSyBi.} It accepts a set of citation data retrieved with {PubMed} and hierarchically and non-hierarchically clusters them based on the titles and the abstracts using statistical and natural language processing methods. A novel point is that {McSyBi} allows its users to change the clustering by entering a {MeSH} term or {UMLS} Semantic Type, and therefore they can see a set of citation data from multiple aspects. We evaluated {McSyBi} quantitatively and qualitatively: clustering of 27 sets of citation data (40643 different papers) and scrutiny of several resultant clusters. While non-hierarchical clustering provides us with an overview of the target topic, hierarchical clustering allows us to see more details and relationships among citation data. {McSyBi} is freely available at {http://textlens.hgc.jp/McSyBi/.}},
}
-
A. Juárez-González, A. Téllez-Valero, C. Denicia-Carral, M. Montes-y-Gómez, and L. Villaseñor-Pineda, "Using machine learning and text mining in question answering," in Evaluation of multilingual and multi-modal information retrieval : 7th workshop of the cross-language evaluation forum, CLEF 2006, Alicante, Spain, september 20-22, 2006 : revised selected papers, Berlin; Heidelberg, 2007, pp. 415-423.
@inproceedings{jurez-gonzlez_using_2007,
  author    = {{Juárez-González}, Antonio and {Téllez-Valero}, Alberto and {Denicia-Carral}, Claudia and {Montes-y-Gómez}, Manuel and {Villaseñor-Pineda}, Luis},
  title     = {Using machine learning and text mining in question answering},
  booktitle = {Evaluation of multilingual and multi-modal information retrieval : 7th workshop of the cross-language evaluation forum, {CLEF} 2006, Alicante, Spain, september 20-22, 2006 : revised selected papers},
  series    = {Lecture Notes in Computer Science},
  volume    = {4730},
  pages     = {415--423},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2007},
  url       = {http://dx.doi.org/10.1007/978-3-540-74999-8_49},
  keywords  = {Apprentissage machine, Fouille de texte},
  abstract  = {This paper describes a {QA} system centered in a full data-driven architecture. It applies machine learning and text mining techniques to identify the most probable answers to factoid and definition questions respectively. Its major quality is that it mainly relies on the use of lexical information and avoids applying any complex language processing resources such as named entity classifiers, parsers and ontologies. Experimental results on the Spanish Question Answering task at {CLEF} 2006 show that the proposed architecture can be a practical solution for monolingual question answering by reaching a precision as high as 51\%.},
}
-
V. Khoroshevsky, I. Efimenko, G. Drobyazko, P. Kananykina, V. Klintsov, D. Lisitsin, V. Seledkin, A. Starostin, and V. Vorobyov, "Ontos solutions for semantic Web : text mining, navigation and analytics." Berlin; Heidelberg: Springer, 2007, pp. 11-27.
@incollection{khoroshevsky_ontos_2007,
  author    = {Khoroshevsky, Vladimir and Efimenko, Irina and Drobyazko, Grigory and Kananykina, Polina and Klintsov, Victor and Lisitsin, Dmitry and Seledkin, Viacheslav and Starostin, Anatoli and Vorobyov, Vyacheslav},
  title     = {Ontos solutions for semantic Web : text mining, navigation and analytics},
  booktitle = {Autonomous intelligent systems : multi-agents and data mining},
  series    = {Lecture Notes in Computer Science},
  volume    = {4476},
  pages     = {11--27},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2007},
  url       = {http://dx.doi.org/10.1007/978-3-540-72839-9_2},
  keywords  = {Fouille de texte, Web sémantique},
  abstract  = {This paper deals with the problem of development and implementation of semantic navigation through Web-content. Multi-agent architecture of a solution for Semantic Web and innovative services are presented. In the context of the proposed solution Web mining is carried out by special {OntosMiner} agents, which provide the ontology-driven processing of multilingual text collections on the basis of the special kind of content extraction technologies. First evaluation results of the presented solution are discussed as well.},
  annote    = {{{\textless}p{\textgreater}Accessible} en ligne via Springer (frais) (http://www.springerlink.com/content/d05up5m2g6547803/){\textless}/p{\textgreater}},
}
-
F. Ibekwe-SanJuan, Fouille de textes : méthodes, outils et applications, Paris: Hermès science publications, 2007.
@book{ibekwe-sanjuan_fouille_2007,
  author    = {{Ibekwe-SanJuan}, Fidelia},
  title     = {Fouille de textes : méthodes, outils et applications},
  publisher = {Hermès science publications},
  address   = {Paris},
  year      = {2007},
  isbn      = {9782746216099},
  keywords  = {Fouille de texte},
}
-
M. Bramer, "Text mining." London: Springer, 2007, pp. 239-253.
@incollection{bramer_text_2007,
  author    = {Bramer, Max},
  title     = {Text mining},
  booktitle = {Principles of data mining},
  pages     = {239--253},
  publisher = {Springer},
  address   = {London},
  year      = {2007},
  url       = {http://dx.doi.org/10.1007/978-1-84628-766-4_15},
  keywords  = {Fouille de texte},
}
-
C. Silva and B. Ribeiro, "On text-based mining with active learning and background knowledge using SVM," Soft Computing, vol. 11, iss. 6, pp. 519-530, 2007.
@article{silva_text-based_2007,
  author   = {Silva, C. and Ribeiro, B.},
  title    = {On text-based mining with active learning and background knowledge using {SVM}},
  journal  = {Soft Computing},
  volume   = {11},
  number   = {6},
  pages    = {519--530},
  month    = apr,
  year     = {2007},
  issn     = {1432-7643},
  url      = {http://apps.isiknowledge.com/full_record.do?product=WOS&search_mode=CitationReport&qid=4&SID=1EOnkFI4kknf@49oNkC&page=1&doc=39},
  keywords = {Fouille de texte},
  abstract = {Text mining, intelligent text analysis, text data mining and knowledge-discovery in text are generally used aliases to the process of extracting relevant and non-trivial information from text. Some crucial issues arise when trying to solve this problem, such as document representation and deficit of labeled data. This paper addresses these problems by introducing information from unlabeled documents in the training set, using the support vector machine {(SVM)} separating margin as the differentiating factor. Besides studying the influence of several pre-processing methods and concluding on their relative significance, we also evaluate the benefits of introducing background knowledge in a {SVM} text classifier. We further evaluate the possibility of actively learning and propose a method for successfully combining background knowledge and active learning. Experimental results show that the proposed techniques, when used alone or combined, present a considerable improvement in classification performance, even when small labeled training sets are available.},
}
-
T. Lee, "Constraint-based ontology induction from online customer reviews," Group Decision and Negotiation, vol. 16, pp. 255-281, 2007.
@article{lee_constraint-based_2007,
  author   = {Lee, Thomas},
  title    = {Constraint-based ontology induction from online customer reviews},
  journal  = {Group Decision and Negotiation},
  volume   = {16},
  number   = {3},
  pages    = {255--281},
  year     = {2007},
  url      = {http://www.ingentaconnect.com/content/klu/grup/2007/00000016/00000003/00009065},
  keywords = {Analyse de texte, Fouille de texte, Ontologie},
  abstract = {We present an unsupervised, domain-independent technique for inducing a product-specific ontology of product features based upon online customer reviews. We frame ontology induction as a logical assignment problem and solve it with a bounds consistency constrained logic program. Using shallow natural language processing techniques, reviews are parsed into phrase sequences where each phrase refers to a single concept. Traditional document clustering techniques are adapted to collect phrases into initial concepts. We generate a token graph for each initial concept cluster and find a maximal clique to define the corresponding logical set of concept sub-elements. The logic program assigns tokens to clique sub-elements. We apply the technique to several thousand digital camera customer reviews and evaluate the results by comparing them to the ontologies represented by several prominent online buying guides. Because our results are drawn directly from customer comments, differences between our automatically induced product features and those in extant guides may reflect opportunities for better managing customer-producer relationships rather than errors in the process.},
}
-
A. Stavrianou, P. Andritsos, and N. Nicoloyannis, "Overview and semantic issues of text mining," ACM SIGMOD Record, vol. 36, iss. 3, pp. 23-34, 2007.
@article{stavrianou_overview_2007,
  author   = {Stavrianou, Anna and Andritsos, Periklis and Nicoloyannis, Nicolas},
  title    = {Overview and semantic issues of text mining},
  journal  = {{ACM} {SIGMOD} Record},
  volume   = {36},
  number   = {3},
  pages    = {23--34},
  year     = {2007},
  doi      = {10.1145/1324185.1324190},
  url      = {http://portal.acm.org/ft_gateway.cfm?id=1324190&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  keywords = {Fouille de texte},
  abstract = {Text mining refers to the discovery of previously unknown knowledge that can be found in text collections. In recent years, the text mining field has received great attention due to the abundance of textual data. A researcher in this area is requested to cope with issues originating from the natural language particularities. This survey discusses such semantic issues along with the approaches and methodologies proposed in the existing literature. It covers syntactic matters, tokenization concerns and it focuses on the different text representation techniques, categorisation tasks and similarity measures suggested.},
  annote   = {{{\textless}p{\textgreater}stavrianouAnna2007.pdf{\textless}/p{\textgreater}}},
}
-
O. K. Fivelstad, "Temporal text mining : the TTM testbench," Master's thesis, Norwegian University of Science and Technology, 2007.
@mastersthesis{ole_kristian_fivelstad_temporal_2007,
  author   = {Fivelstad, Ole Kristian},
  title    = {Temporal text mining : the {TTM} testbench},
  school   = {Norwegian University of Science and Technology},
  type     = {Master of Science in Computer Science},
  month    = jun,
  year     = {2007},
  keywords = {Fouille de texte},
  abstract = {This master thesis presents the Temporal Text Mining {(TTM)} Testbench, an application for discovering association rules in temporal document collections. It is a continuation of work done in a project the fall of 2005 and the work done in a project the fall of 2006. These projects have laid the foundation for this thesis. The focus of the work is on identifying and extracting meaningful terms from textual documents to improve the meaningfulness of the mined association rules. Much work has been done to compile the theoretical foundation of this project. This foundation has been used for assessing different approaches for finding meaningful and descriptive terms. The old {TTM} Testbench has been extended to include usage of {WordNet} for finding collocations, performing word sense disambiguation, and finally extracting higher-level concepts and categories from the individual documents. A method for rating association rules based on the semantic similarity of the terms present in the rules has also been implemented. This was done to try to narrow down the result set, and filter out rules which are not likely to be interesting. Experiments performed with the improved application shows that the usage of {WordNet} can help increase the meaningfulness of the rules. One factor which plays a big part in this, is that synonyms of words are added to make the term more understandable. However, the experiments showed that it was difficult to decide if a rule was interesting or not, this made it impossible to draw any conclusions with regards to the suitability of semantic similarity in the rating of the rules. All work on the {TTM} Testbench so far has focused on finding association rules in web newspapers. It may however be useful to perform experiments in a more limited domain, for example medicine, where the interestingness of a rule may be more easily decided.},
  annote   = {{{\textless}p{\textgreater}fivelstadOle2007.pdf{\textless}/p{\textgreater}}},
}
-
A. Mehler, "Compositionality in quantitative semantics : a theoretical perspective on text mining," in Aspects of automatic text analysis, 2007.
@incollection{mehler_compositionality_2007,
  author    = {Mehler, Alexander},
  title     = {Compositionality in quantitative semantics : a theoretical perspective on text mining},
  booktitle = {Aspects of automatic text analysis},
  publisher = {Springer},
  address   = {Berlin; Heidelberg},
  year      = {2007},
  url       = {http://dx.doi.org/10.1007/978-3-540-37522-7_7},
  keywords  = {Fouille de texte},
  abstract  = {This chapter introduces a variant of the principle of compositionality in quantitative text semantics as an alternative to the bag-of-features approach. The variant includes effects of context-sensitive interpretation as well as processes of meaning constitution and change in the sense of usage-based semantics. Its starting point is a combination of semantic space modeling and text structure analysis. The principle is implemented by means of a hierarchical constraint satisfaction process which utilizes the notion of hierarchical text structure superimposed by graph-inducing coherence relations. The major contribution of the chapter is a conceptualization and formalization of the principle of compositionality in terms of semantic spaces which tackles some well known deficits of existing approaches. In particular this relates to the missing linguistic interpretability of statistical meaning representations.},
}
-
T. Jiang, A.-H. Tan, and K. Wang, "Mining generalized associations of semantic relations from textual Web content," IEEE Transactions on Knowledge and Data Engineering, vol. 19, iss. 2, pp. 164-179, 2007.
@article{tao_jiang_mining_2007,
  author   = {Jiang, Tao and Tan, Ah-Hwee and Wang, Ke},
  title    = {Mining generalized associations of semantic relations from textual Web content},
  journal  = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume   = {19},
  number   = {2},
  pages    = {164--179},
  year     = {2007},
  issn     = {1041-4347},
  doi      = {10.1109/TKDE.2007.36},
  url      = {http://ieeexplore.ieee.org/iel5/69/4039276/04039281.pdf?tp=&isnumber=4039276&arnumber=4039281&punumber=%3Cb%3E%3Cfont%20color=990000%3E69%3C/font%3E%3C/b%3E},
  keywords = {Fouille de texte},
  abstract = {Traditional text mining techniques transform free text into flat bags of words representation, which does not preserve sufficient semantics for the purpose of knowledge discovery. In this paper, we present a two-step procedure to mine generalized associations of semantic relations conveyed by the textual content of Web documents. First, {RDF} (resource description framework) metadata representing semantic relations are extracted from raw text using a myriad of natural language processing techniques. The relation extraction process also creates a term taxonomy in the form of a sense hierarchy inferred from {WordNet.} Then, a novel generalized association pattern mining algorithm {(GP-Close)} is applied to discover the underlying relation association patterns on {RDF} metadata. For pruning the large number of redundant overgeneralized patterns in relation pattern search space, the {GP-Close} algorithm adopts the notion of generalization closure for systematic overgeneralization reduction. The efficacy of our approach is demonstrated through empirical experiments conducted on an online database of terrorist activities},
  annote   = {{{\textless}p{\textgreater}jiangTao2007.pdf{\textless}/p{\textgreater}}},
}
-
M. W. Berry and M. Castellanos, Survey of Text Mining II: Clustering, Classification, and Retrieval, 1 ed., Springer, 2007.
@book{berry_survey_2007,
  author     = {Berry, Michael W. and Castellanos, Malu},
  title      = {Survey of Text Mining {II:} Clustering, Classification, and Retrieval},
  shorttitle = {Survey of Text Mining {II}},
  edition    = {1},
  publisher  = {Springer},
  month      = dec,
  year       = {2007},
  isbn       = {1848000456},
  keywords   = {Classification, Cluster, Fouille de texte},
}
-
X. Wang, C. Zhai, X. Hu, and R. Sproat, "Mining correlated bursty topic patterns from coordinated text streams," San Jose, California, USA, 2007, pp. 784-793.
@inproceedings{wang_mining_2007,
  author    = {Wang, Xuanhui and Zhai, ChengXiang and Hu, Xiao and Sproat, Richard},
  title     = {Mining correlated bursty topic patterns from coordinated text streams},
  booktitle = {Proceedings of the 13th {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
  address   = {San Jose, California, {USA}},
  publisher = {{ACM}},
  year      = {2007},
  pages     = {784--793},
  isbn      = {978-1-59593-609-7},
  doi       = {10.1145/1281192.1281276},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1281276&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  keywords  = {Fouille de texte},
  abstract  = {Previous work on text mining has almost exclusively focused on a single stream. However, we often have available multiple text streams indexed by the same set of time points (called coordinated text streams), which offer new opportunities for text mining. For example, when a major event happens, all the news articles published by different agencies in different languages tend to cover the same event for a certain period, exhibiting a correlated bursty topic pattern in all the news article streams. In general, mining correlated bursty topic patterns from coordinated text streams can reveal interesting latent associations or events behind these streams. In this paper, we define and study this novel text mining problem. We propose a general probabilistic algorithm which can effectively discover correlated bursty patterns and their bursty periods across text streams even if the streams have completely different vocabularies (e.g., English vs Chinese). Evaluation of the proposed method on a news data set and a literature data set shows that it can effectively discover quite meaningful topic patterns from both data sets: the patterns discovered from the news data set accurately reveal the major common events covered in the two streams of news articles (in English and Chinese, respectively), while the patterns discovered from two database publication streams match well with the major research paradigm shifts in database research. Since the proposed method is general and does not require the streams to share vocabulary, it can be applied to any coordinated text streams to discover correlated topic patterns that burst in multiple streams in the same period.},
  annote    = {{{\textless}p{\textgreater}wangXuanhui2007.pdf{\textless}/p{\textgreater}}}
},
-
Y. Liang and R. Tan, "A text-mining-based patent analysis in product innovative process," in Trends in computer aided innovation. Second IFIP Working Conference on Computer Aided Innovation, October 8–9 2007, Michigan, USA, Berlin; Heidelberg, 2007, pp. 89-96.
@inproceedings{liang_text-mining-based_2007,
  author    = {Liang, Yanhong and Tan, Runhua},
  title     = {A text-mining-based patent analysis in product innovative process},
  booktitle = {Trends in computer aided innovation. Second {IFIP} Working Conference on Computer Aided Innovation, October 8–9 2007, Michigan, {USA}},
  series    = {{IFIP} International Federation for Information Processing},
  volume    = {250},
  address   = {Berlin; Heidelberg},
  publisher = {Springer},
  year      = {2007},
  pages     = {89--96},
  doi       = {10.1007/978-0-387-75456-7_9},
  url       = {http://dx.doi.org/10.1007/978-0-387-75456-7_9},
  keywords  = {Fouille de texte},
  abstract  = {Patent documents contain important technical knowledge and research results. They have high quality information to inspire designers in product development. However, they are lengthy and have much noisy results such that it takes a lot of human efforts for analysis. And due to the fact that hidden and unanticipated information plays a dominant role for {TRIZ} user, it is difficult to discern manually, thus, patent analysis has long been considered useful in product innovative process. Automatic tools for assisting innovators and patent engineers in obtaining useful information from patent documents are in great demand. In {TRIZ} theory, a product design problem can be considered as one or several Contradictions and Inventive Principles. Text mining could be used to analyze these textual documents and extract useful information from large amount documents quickly and automatically. In this paper, a computer-aided approach for extracting useful information from patent documents according to {TRIZ} Inventive Principles is proposed.}
},
-
F. Song, D. Zhang, Y. Xu, and J. Wang, "Five new feature selection metrics in text categorization," International Journal of Pattern Recognition and Artificial Intelligence, vol. 21, iss. 6, pp. 1085-1101, 2007.
@article{fengxi_song_five_2007,
  author   = {Song, Fengxi and Zhang, David and Xu, Yong and Wang, Jizhong},
  title    = {Five new feature selection metrics in text categorization},
  journal  = {International Journal of Pattern Recognition and Artificial Intelligence},
  volume   = {21},
  number   = {6},
  pages    = {1085--1101},
  month    = sep,
  year     = {2007},
  issn     = {0218-0014},
  keywords = {Catégorisation, Fouille de texte},
  abstract = {Feature selection has been extensively applied in statistical pattern recognition as a mechanism for cleaning up the set of features that are used to represent data and as a way of improving the performance of classifiers. Four schemes commonly used for feature selection are Exponential Searches, Stochastic Searches, Sequential Searches, and Best Individual Features. The most popular scheme used in text categorization is Best Individual Features as the extremely high dimensionality of text feature spaces render the other three feature selection schemes time prohibitive. This paper proposes five new metrics for selecting Best Individual Features for use in text categorization. Their effectiveness have been empirically tested on two well-known data collections, Reuters-21578 and 20 Newsgroups. Experimental results show that the performance of two of the five new metrics, Bayesian Rule and F-one Value, is not significantly below that of a good traditional text categorization selection metric, Document Frequency. The performance of another two of these five new metrics, Low Loss Dimensionality Reduction and Relative Frequency Difference, is equal to or better than that of conventional good feature selection metrics such as Mutual Information and Chi-square Statistic.},
  annote   = {{{\textless}p{\textgreater}Accession} Number: 26619322; {FENGXI} {SONG} 1,2; Email Address: songfengxi@yahoo.com; {ZHANG,} {DAVID} 3; {YONG} {XU} 2; {JIZHONG} {WANG} 1; Affiliations: 1: Department of Automation and Simulation, 451 Huang Shan Road Hefei, Anhui 230031, P. R. China; 2: Shenzhen Graduate School, Harbin Institute of Technology; 3: Hong Kong Polytechnic University; Issue Info: Sep2007, Vol. 21 Issue 6, p1085; Thesaurus Term: {TEXT} processing {(Computer} science); Thesaurus Term: {TEXT} files; Subject Term: {TEXT} mining {(Information} retrieval); Subject Term: {PATTERN} recognition systems; Subject Term: {PATTERN} perception; Subject Term: {MULTIPLE} comparisons {(Statistics);} {Author-Supplied} Keyword: Feature selection; {Author-Supplied} Keyword: multiple comparative test; {Author-Supplied} Keyword: pattern recognition; {Author-Supplied} Keyword: support vector machines; {Author-Supplied} Keyword: text categorization; Number of Pages: 17p; Illustrations: 1 chart, 5 graphs, 2 color; Document Type: Article{\textless}/p{\textgreater}}
},
-
G. Nenadić and S. Ananiadou, "Mining semantically related terms from biomedical literature," ACM Transactions on Asian Language Information Processing (TALIP), vol. 5, iss. 1, pp. 22-43, 2006.
@article{nenadi_mining_2006,
  author   = {Nenadić, Goran and Ananiadou, Sophia},
  title    = {Mining semantically related terms from biomedical literature},
  journal  = {{ACM} Transactions on Asian Language Information Processing {(TALIP)}},
  volume   = {5},
  number   = {1},
  pages    = {22--43},
  year     = {2006},
  doi      = {10.1145/1131348.1131351},
  url      = {http://doi.acm.org/10.1145/1131348.1131351},
  keywords = {Fouille de texte},
  abstract = {Discovering links and relationships is one of the main challenges in biomedical research, as scientists are interested in uncovering entities that have similar functions, take part in the same processes, or are coregulated. This article discusses the extraction of such semantically related entities (represented by domain terms) from biomedical literature. The method combines various text-based aspects, such as lexical, syntactic, and contextual similarities between terms. Lexical similarities are based on the level of sharing of word constituents. Syntactic similarities rely on expressions (such as term enumerations and conjunctions) in which a sequence of terms appears as a single syntactic unit. Finally, contextual similarities are based on automatic discovery of relevant contexts shared among terms. The approach is evaluated using the Genia resources, and the results of experiments are presented. Lexical and syntactic links have shown high precision and low recall, while contextual similarities have resulted in significantly higher recall with moderate precision. By combining the three metrics, we achieved F measures of 68\% for semantically related terms and 37\% for highly related entities.}
},
-
H. Azzag, C. Guinot, and G. Venturini, "Data and text mining with hierarchical clustering ants." Berlin; Heidelberg: Springer, 2006, pp. 153-189.
@incollection{azzag_data_2006,
  author    = {Azzag, Hanene and Guinot, Christiane and Venturini, Gilles},
  title     = {Data and text mining with hierarchical clustering ants},
  booktitle = {Swarm intelligence in data mining},
  series    = {Studies in Computational Intelligence},
  volume    = {34},
  address   = {Berlin; Heidelberg},
  publisher = {Springer},
  year      = {2006},
  pages     = {153--189},
  doi       = {10.1007/978-3-540-34956-3_7},
  url       = {http://dx.doi.org/10.1007/978-3-540-34956-3_7},
  keywords  = {Fouille de donnée, Fouille de texte}
},
-
P. Juola, J. Sofko, and P. Brennan, "A prototype for authorship attribution studies," Literary and Linguistic Computing, vol. 21, iss. 2, pp. 169-178, 2006.
@article{juola_prototype_2006,
  author   = {Juola, Patrick and Sofko, John and Brennan, Patrick},
  title    = {A prototype for authorship attribution studies},
  journal  = {Literary and Linguistic Computing},
  volume   = {21},
  number   = {2},
  pages    = {169--178},
  year     = {2006},
  doi      = {10.1093/llc/fql019},
  keywords = {Catalogage, Fouille de texte},
  abstract = {Despite a century of research, statistical and computational methods for authorship attribution are neither reliable, well-regarded, widely used, or well-understood. This article presents a survey of the current state of the art as well as a framework for uniform and unified development of a tool to apply the state of the art, despite the wide variety of methods and techniques used. The usefulness of the framework is confirmed by the development of a tool using that framework that can be applied to authorship analysis by researchers without a computing specialization. Using this tool, it may be possible both to expand the pool of available researchers as well as to enhance the quality of the overall solutions [for example, by incorporating improved algorithms as discovered through empirical analysis {(Juola,} P. (2004a). Ad-hoc Authorship Attribution Competition. In Proceedings 2004 Joint International Conference of the Association for Literary and Linguistic Computing and the Association for Computers and the Humanities {(ALLC/ACH} 2004), Goteborg, Sweden)].},
  annote   = {{{\textless}p{\textgreater}juolaPatrick2006.pdf{\textless}/p{\textgreater}}}
},
-
Y. Bi, S. McClean, and T. Anderson, "Combining rough decisions for intelligent text mining using Dempster’s rule," Artificial Intelligence Review, vol. 26, iss. 3, pp. 191-209, 2006.
@article{bi_combining_2006,
  author   = {Bi, Yaxin and {McClean}, Sally and Anderson, Terry},
  title    = {Combining rough decisions for intelligent text mining using {Dempster's} rule},
  journal  = {Artificial Intelligence Review},
  volume   = {26},
  number   = {3},
  pages    = {191--209},
  month    = nov,
  year     = {2006},
  doi      = {10.1007/s10462-007-9049-y},
  url      = {http://dx.doi.org/10.1007/s10462-007-9049-y},
  keywords = {Fouille de texte},
  abstract = {An important issue in text mining is how to make use of multiple pieces knowledge discovered to improve future decisions. In this paper, we propose a new approach to combining multiple sets of rules for text categorization using Dempster’s rule of combination. We develop a boosting-like technique for generating multiple sets of rules based on rough set theory and model classification decisions from multiple sets of rules as pieces of evidence which can be combined by Dempster’s rule of combination. We apply these methods to 10 of the 20-newsgroups—a benchmark data collection {(Baker} and {McCallum} 1998), individually and in combination. Our experimental results show that the performance of the best combination of the multiple sets of rules on the 10 groups of the benchmark data is statistically significant and better than that of the best single set of rules. The comparative analysis between the {Dempster–Shafer} and the majority voting {(MV)} methods along with an overfitting study confirm the advantage and the robustness of our approach.}
},
-
D. Mladenič, "Text mining in action!," in From data and information analysis to knowledge engineering : proceedings of the 29th annual conference of the Gesellschaft für Klassifikation e.V. University of Magdeburg, march 9–11, 2005, Berlin; Heidelberg, 2006, pp. 65-62.
@inproceedings{mladeni_text_2006,
  author        = {Mladenič, Dunja},
  title         = {Text mining in action!},
  booktitle     = {From data and information analysis to knowledge engineering : proceedings of the 29th annual conference of the Gesellschaft für Klassifikation {e.V.} University of Magdeburg, march 9–11, 2005},
  series        = {Studies in classification, data analysis, and knowledge organization},
  address       = {Berlin; Heidelberg},
  publisher     = {Springer},
  year          = {2006},
  pages         = {65--62},
  isbn          = {978-3-540-31313-7},
  doi           = {10.1007/3-540-31314-1_6},
  url           = {http://dx.doi.org/10.1007/3-540-31314-1_6},
  keywords      = {Fouille de texte},
  abstract      = {Text mining methods have being successfully used on different problems, where text data is involved. Some Text mining approaches are capable of handling text just relying on statistics such as, frequency of words or phrases, while others assume availability of additional resources such as, natural language processing tools for the language in which the text is written; availability of lexicons; ontologies of concepts; aligned corpus in several languages; additional data sources such as, links between the text units or other non-textual data. This paper aims at illustrating potential of Text mining by presenting several approaches having some of the listed properties. For this purpose, we present research applications that were developed mainly inside European projects in collaboration with end-users and, research prototypes that do not necessary involve end-users.},
  internal-note = {NOTE(review): pages 65--62 is an impossible range (end before start); the first page is probably mistyped. Verify against the publisher record for the DOI before citing.}
},
-
J. Vivaldi and H. Rodriguez, "Some notes about the evaluation of terms and term extraction systems," in 5th International Conference on Language Resources and Evaluation, 2006.
@inproceedings{vivaldi_notes_2006,
  author    = {Vivaldi, Jorge and Rodriguez, Horacio},
  title     = {Some notes about the evaluation of terms and term extraction systems},
  booktitle = {5th International Conference on Language Resources and Evaluation},
  year      = {2006},
  keywords  = {Extraction d'information, Fouille de texte}
},
-
J. J. García Adeva and R. Calvo, "Mining Text with Pimiento," IEEE Internet Computing, vol. 10, iss. 4, pp. 27-35, 2006.
@article{adeva_mining_2006,
  author   = {García Adeva, J. J. and Calvo, R.},
  title    = {Mining Text with {Pimiento}},
  journal  = {{IEEE} Internet Computing},
  volume   = {10},
  number   = {4},
  pages    = {27--35},
  year     = {2006},
  url      = {http://portal.acm.org/citation.cfm?id=1158822.1159024},
  keywords = {Cluster, Extraction d'information, Fouille de texte},
  abstract = {Information systems are using an increasing amount of unstructured information in the form of text. This situation has spawned a need to improve the text-mining technologies needed for information retrieval, filtering, and classification. This article compares some of the options available and how they can provide textual data-mining functionalities to software applications. In particular, the authors focus on Pimiento, a new object-oriented application framework for text mining. This framework allows developers to easily create distributed applications that use machine learning and statistical techniques to automatically process documents.},
  annote   = {{{\textless}p{\textgreater}adevaJuanJose2006.pdf{\textless}/p{\textgreater}}}
},
-
A. Culotta, A. McCallum, and J. Betz, "Integrating probabilistic extraction models and data mining to discover relations and patterns in text," in Proceedings of the Human Language Technology Conference of the NAACL, Main Conference, New York, New York, 2006, pp. 296-303.
@inproceedings{culotta_integrating_2006,
  author    = {Culotta, Aron and {McCallum}, Andrew and Betz, Jonathan},
  title     = {Integrating probabilistic extraction models and data mining to discover relations and patterns in text},
  booktitle = {Proceedings of the Human Language Technology Conference of the {NAACL}, Main Conference},
  address   = {New York, New York},
  publisher = {Association for Computational Linguistics},
  year      = {2006},
  pages     = {296--303},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1220873&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  keywords  = {Approche probabiliste, Fouille de donnée, Fouille de texte},
  abstract  = {In order for relation extraction systems to obtain human-level performance, they must be able to incorporate relational patterns inherent in the data (for example, that one's sister is likely one's mother's daughter, or that children are likely to attend the same college as their parents). Hand-coding such knowledge can be time-consuming and inadequate. Additionally, there may exist many interesting, unknown relational patterns that both improve extraction performance and provide insight into text. We describe a probabilistic extraction model that provides mutual benefits to both "top-down" relational pattern discovery and "bottom-up" relation extraction.},
  annote    = {{{\textless}p{\textgreater}culottaAron2006.pdf{\textless}/p{\textgreater}}}
},
-
H. Kim, "On text mining algorithms for automated maintenance of hierarchical knowledge directory," in Knowledge science, engineering and management. First International Conference, KSEM 2006, Guilin, China, August 5-8, 2006. Proceedings, Berlin; Heidelberg, 2006, pp. 202-214.
@inproceedings{kim_text_2006,
  author    = {Kim, Han-joon},
  title     = {On text mining algorithms for automated maintenance of hierarchical knowledge directory},
  booktitle = {Knowledge science, engineering and management. First International Conference, {KSEM} 2006, Guilin, China, August 5-8, 2006. Proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {4092},
  address   = {Berlin; Heidelberg},
  publisher = {Springer},
  year      = {2006},
  pages     = {202--214},
  doi       = {10.1007/11811220_18},
  url       = {http://dx.doi.org/10.1007/11811220_18},
  keywords  = {Fouille de texte},
  abstract  = {This paper presents a series of text-mining algorithms for managing knowledge directory, which is one of the most crucial problems in constructing knowledge management systems today. In future systems, the constructed directory, in which knowledge objects are automatically classified, should evolve so as to provide a good indexing service, as the knowledge collection grows or its usage changes. One challenging issue is how to combine manual and automatic organization facilities that enable a user to flexibly organize obtained knowledge by the hierarchical structure over time. To this end, I propose three algorithms that utilize text mining technologies: semi-supervised classification, semi-supervised clustering, and automatic directory building. Through experiments using controlled document collections, the proposed approach is shown to significantly support hierarchical organization of large electronic knowledge base with minimal human effort.}
},
-
S. Ananiadou and J. McNaught, Text mining for biology and biomedicine, Boston: Artech House, 2006.
@book{ananiadou_text_2006,
  editor    = {Ananiadou, Sophia and {McNaught}, John},
  title     = {Text mining for biology and biomedicine},
  address   = {Boston},
  publisher = {Artech House},
  year      = {2006},
  isbn      = {158053984X},
  keywords  = {Fouille de texte}
},
-
K. McGarry, "Recent trends in knowledge and data integration for the life sciences," Expert Systems, vol. 23, iss. 5, pp. 330-341, 2006.
@article{mcgarry_recent_2006,
  author   = {{McGarry}, Ken},
  title    = {Recent trends in knowledge and data integration for the life sciences},
  journal  = {Expert Systems},
  volume   = {23},
  number   = {5},
  pages    = {330--341},
  year     = {2006},
  url      = {http://www.ingentaconnect.com/content/bpl/exsy/2006/00000023/00000005/art00009},
  keywords = {Bio informatic, Fouille de texte, Ontologie},
  abstract = {The bioscience field has seen some spectacular advances in genomic and proteomic technologies that are able to deliver vast quantities of information on cellular activity. Such technologies are of critical importance to biology, medical science and in drug discovery. However, living systems are highly complex and to fully exploit these technologies requires knowledge at many different levels. Information such as genome sequence data, gene expression data, protein-to-protein interactions and metabolic pathways is required to understand the complexity of biological processes. The challenge for bioinformatics is to tackle the problem of fragmentation of knowledge by integrating the many sources of heterogeneous information into a coherent entity. Another problem is that the high level of biological complexity and the fragmented nature of biological research has meant that it is difficult to keep fully conversant with the latest research and discoveries. Progress in one area of biology may have implications for other areas but the dissemination of this knowledge is not straightforward; difficulties such as differences in naming conventions for genes and biological processes has led to confusion and the lack of productivity. This paper reviews the most recent research to overcome the fragmentation problem where technologies such as text mining and ontologies are used within the knowledge discovery process and the specific technical challenges they address.}
},
-
L. Dey, A. C. Rastogi, and S. Kumar, "Generating concept ontologies through text mining," in Proceedings of the 2006 IEEE/WIC/ACM International Conference on Web Intelligence (WI 2006), 2006, pp. 23-32.
@inproceedings{dey_generating_2006,
  author    = {Dey, Lipika and Rastogi, Ashish Chandra and Kumar, Sachin},
  title     = {Generating concept ontologies through text mining},
  booktitle = {Proceedings of the 2006 {IEEE/WIC/ACM} International Conference on Web Intelligence ({WI} 2006)},
  publisher = {{IEEE} Computer Society},
  year      = {2006},
  pages     = {23--32},
  isbn      = {0-7695-2747-7},
  url       = {http://portal.acm.org/ft_gateway.cfm?id=1249038&type=pdf&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  keywords  = {Fouille de texte, Ontologie},
  abstract  = {Designing mechanisms for creating concept ontologies automatically is an important research problem. In this work we have proposed a rough-set based mechanism to generate concept ontologies with concepts mined from documents. When the concept ontology is mined from preclassified documents, the output signifies the core set of domain concepts and their inter-relationships that define the categories, as well as the inter-category relationships. When the ontology is mined from a heterogeneous collection, the documents are first clustered into homogeneous groups and then mined for concepts. Rough set based lower and upper approximations have been used to identify core concepts and associated concepts for a domain or a group. The scheme has been tested over multiple domains.},
  annote    = {{{\textless}p{\textgreater}deyLipika2006.pdf{\textless}/p{\textgreater}}}
},
-
H. Cherfi, A. Napoli, and Y. Toussaint, "Towards a text mining methodology using association rule extraction," Soft Computing, vol. 10, iss. 5, pp. 431-441, 2006.
@article{cherfi_towardstext_2006,
  author   = {Cherfi, H. and Napoli, A. and Toussaint, Y.},
  title    = {Towards a text mining methodology using association rule extraction},
  journal  = {Soft Computing},
  volume   = {10},
  number   = {5},
  pages    = {431--441},
  month    = mar,
  year     = {2006},
  issn     = {1432-7643},
  doi      = {10.1007/s00500-005-0504-x},
  keywords = {Fouille de texte},
  abstract = {This paper proposes a methodology for text mining relying on the classical knowledge discovery loop, with a number of adaptations. First, texts are indexed and prepared to be processed by frequent itemset levelwise search. Association rules are then extracted and interpreted, with respect to a set of quality measures and domain knowledge, under the control of an analyst. The article includes an experimentation on a real-world text corpus holding on molecular biology.},
  annote   = {{{\textless}p{\textgreater}cherfiH2006.pdf{\textless}/p{\textgreater}}}
},
-
L. J. Jensen, J. Saric, and P. Bork, "Literature mining for the biologist: from information retrieval to biological discovery," Nature Reviews Genetics, vol. 7, iss. 2, pp. 119-129, 2006.
@article{jensen_literature_2006,
  author     = {Jensen, Lars Juhl and Saric, Jasna and Bork, Peer},
  title      = {Literature mining for the biologist: from information retrieval to biological discovery},
  shorttitle = {Literature mining for the biologist},
  journal    = {Nature Reviews Genetics},
  volume     = {7},
  number     = {2},
  pages      = {119--129},
  year       = {2006},
  doi        = {10.1038/nrg1768},
  keywords   = {Extraction d'information, Fouille de texte}
},
-
Q. Mei and C. Zhai, "A mixture model for contextual text mining," in Proceedings of the 12th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, Philadelphia, PA, USA, 2006, pp. 649-655.
@inproceedings{mei_mixture_2006,
  author    = {Mei, Qiaozhu and Zhai, ChengXiang},
  title     = {A mixture model for contextual text mining},
  booktitle = {Proceedings of the 12th {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
  address   = {Philadelphia, {PA}, {USA}},
  publisher = {{ACM}},
  year      = {2006},
  pages     = {649--655},
  isbn      = {1-59593-339-5},
  doi       = {10.1145/1150402.1150482},
  url       = {http://portal.acm.org/citation.cfm?id=1150402.1150482&coll=ACM&dl=ACM&CFID=76094285&CFTOKEN=90415435},
  keywords  = {Fouille de texte},
  abstract  = {Contextual text mining is concerned with extracting topical themes from a text collection with context information (e.g., time and location) and comparing/analyzing the variations of themes over different contexts. Since the topics covered in a document are usually related to the context of the document, analyzing topical themes within context can potentially reveal many interesting theme patterns. In this paper, we generalize some of these models proposed in the previous work and we propose a new general probabilistic model for contextual text mining that can cover several existing models as special cases. Specifically, we extend the probabilistic latent semantic analysis {(PLSA)} model by introducing context variables to model the context of a document. The proposed mixture model, called contextual probabilistic latent semantic analysis {(CPLSA)} model, can be applied to many interesting mining tasks, such as temporal text mining, spatiotemporal text mining, author-topic analysis, and cross-collection comparative analysis. Empirical experiments show that the proposed mixture model can discover themes and their contextual variations effectively.},
  annote    = {{{\textless}p{\textgreater}meiQiaozhu2006.pdf{\textless}/p{\textgreater}}}
},
-
C. Baker and R. Witte, "Mutation Mining—A Prospector’s Tale," Information Systems Frontiers, vol. 8, iss. 1, pp. 47-57, 2006.
@article{baker_mutation_2006,
  author   = {Baker, Christopher and Witte, Rene},
  title    = {Mutation Mining---A Prospector's Tale},
  journal  = {Information Systems Frontiers},
  volume   = {8},
  number   = {1},
  pages    = {47--57},
  year     = {2006},
  keywords = {Fouille de donnée, Fouille de texte},
  abstract = {Protein structure visualization tools render images that allow the user to explore structural features of a protein. Context specific information relating to a particular protein or protein family is, however, not easily integrated and must be uploaded from databases or provided through manual curation of input files. Protein Engineers spend considerable time iteratively reviewing both literature and protein structure visualizations manually annotated with mutated residues. Meanwhile, text mining tools are increasingly used to extract specific units of raw text from scientific literature and have demonstrated the potential to support the activities of Protein Engineers. The transfer of mutation specific raw-text annotations to protein structures requires integrated data processing pipelines that can co-ordinate information retrieval, information extraction, protein sequence retrieval, sequence alignment and mutant residue mapping. We describe the Mutation Miner pipeline designed for this purpose and present case study evaluations of the key steps in the process. Starting with literature about mutations made to protein families; haloalkane dehalogenase, bi-phenyl dioxygenase, and xylanase we enumerate relevant documents available for text mining analysis, the available electronic formats, and the number of mutations made to a given protein family. We review the efficiency of {NLP} driven protein sequence retrieval from databases and report on the effectiveness of Mutation Miner in mapping annotations to protein structure visualizations. We highlight the feasibility and practicability of the approach.}
},
-
G. Paaß and H. Vries, "Evaluating the performance of text mining systems on real-world press archives," in From data and information analysis to knowledge engineering. Proceedings of the 29th Annual Conference of the Gesellschaft für Klassifikation e.V. University of Magdeburg, March 9–11, 2005, Berlin; Heidelberg, 2006, pp. 414-421.
@inproceedings{paa_evaluatingperformance_2006, address = {Berlin; Heidelberg},
series = {Studies in classification, data analysis, and knowledge organization},
title = {Evaluating the performance of text mining systems on real-world press archives},
url = {http://dx.doi.org/10.1007/3-540-31314-1_50},
abstract = {We investigate the performance of text mining systems for annotating press articles in two real-world press archives. Seven commercial systems are tested which recover the categories of a document as well named entities and catchphrases. Using cross-validation we evaluate the precision-recall characteristic. Depending on the depth of the category tree 39–79\% breakeven is achieved. For one corpus 45\% of the documents can be classified automatically, based on the system’s confidence estimates. In a usability experiment the formal evaluation results